3468 lines
122 KiB
Plaintext
3468 lines
122 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Унитарное кодирование\n",
|
||
"\n",
|
||
"Преобразование категориального признака в несколько бинарных признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Загрузка набора данных World Population"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>Yearly</th>\n",
|
||
" <th>NetChange</th>\n",
|
||
" <th>Density</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>Migrants</th>\n",
|
||
" <th>FertRate</th>\n",
|
||
" <th>MedAge</th>\n",
|
||
" <th>UrbanPop</th>\n",
|
||
" <th>WorldShare</th>\n",
|
||
" <th>Net Change</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>0.39</td>\n",
|
||
" <td>5,540,090</td>\n",
|
||
" <td>153</td>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>-348,399</td>\n",
|
||
" <td>1.7</td>\n",
|
||
" <td>38</td>\n",
|
||
" <td>61%</td>\n",
|
||
" <td>18.47%</td>\n",
|
||
" <td>5540090</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>0.99</td>\n",
|
||
" <td>13,586,631</td>\n",
|
||
" <td>464</td>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>-532,687</td>\n",
|
||
" <td>2.2</td>\n",
|
||
" <td>28</td>\n",
|
||
" <td>35%</td>\n",
|
||
" <td>17.70%</td>\n",
|
||
" <td>13586631</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>0.59</td>\n",
|
||
" <td>1,937,734</td>\n",
|
||
" <td>36</td>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>954,806</td>\n",
|
||
" <td>1.8</td>\n",
|
||
" <td>38</td>\n",
|
||
" <td>83%</td>\n",
|
||
" <td>4.25%</td>\n",
|
||
" <td>1937734</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>1.07</td>\n",
|
||
" <td>2,898,047</td>\n",
|
||
" <td>151</td>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>-98,955</td>\n",
|
||
" <td>2.3</td>\n",
|
||
" <td>30</td>\n",
|
||
" <td>56%</td>\n",
|
||
" <td>3.51%</td>\n",
|
||
" <td>2898047</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>2.00</td>\n",
|
||
" <td>4,327,022</td>\n",
|
||
" <td>287</td>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>-233,379</td>\n",
|
||
" <td>3.6</td>\n",
|
||
" <td>23</td>\n",
|
||
" <td>35%</td>\n",
|
||
" <td>2.83%</td>\n",
|
||
" <td>4327022</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>231</th>\n",
|
||
" <td>Montserrat</td>\n",
|
||
" <td>4992</td>\n",
|
||
" <td>0.06</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>50</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>10%</td>\n",
|
||
" <td>0.00%</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>232</th>\n",
|
||
" <td>Falkland Islands</td>\n",
|
||
" <td>3480</td>\n",
|
||
" <td>3.05</td>\n",
|
||
" <td>103</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>12170</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>66%</td>\n",
|
||
" <td>0.00%</td>\n",
|
||
" <td>103</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>233</th>\n",
|
||
" <td>Niue</td>\n",
|
||
" <td>1626</td>\n",
|
||
" <td>0.68</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>260</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>46%</td>\n",
|
||
" <td>0.00%</td>\n",
|
||
" <td>11</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>234</th>\n",
|
||
" <td>Tokelau</td>\n",
|
||
" <td>1357</td>\n",
|
||
" <td>1.27</td>\n",
|
||
" <td>17</td>\n",
|
||
" <td>136</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>0%</td>\n",
|
||
" <td>0.00%</td>\n",
|
||
" <td>17</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>235</th>\n",
|
||
" <td>Holy See</td>\n",
|
||
" <td>801</td>\n",
|
||
" <td>0.25</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2,003</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>0.00%</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>235 rows × 12 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 Yearly NetChange Density LandArea \\\n",
|
||
"no \n",
|
||
"1 China 1439323776 0.39 5,540,090 153 9388211 \n",
|
||
"2 India 1380004385 0.99 13,586,631 464 2973190 \n",
|
||
"3 United States 331002651 0.59 1,937,734 36 9147420 \n",
|
||
"4 Indonesia 273523615 1.07 2,898,047 151 1811570 \n",
|
||
"5 Pakistan 220892340 2.00 4,327,022 287 770880 \n",
|
||
".. ... ... ... ... ... ... \n",
|
||
"231 Montserrat 4992 0.06 3 50 100 \n",
|
||
"232 Falkland Islands 3480 3.05 103 0 12170 \n",
|
||
"233 Niue 1626 0.68 11 6 260 \n",
|
||
"234 Tokelau 1357 1.27 17 136 10 \n",
|
||
"235 Holy See 801 0.25 2 2,003 0 \n",
|
||
"\n",
|
||
" Migrants FertRate MedAge UrbanPop WorldShare Net Change \n",
|
||
"no \n",
|
||
"1 -348,399 1.7 38 61% 18.47% 5540090 \n",
|
||
"2 -532,687 2.2 28 35% 17.70% 13586631 \n",
|
||
"3 954,806 1.8 38 83% 4.25% 1937734 \n",
|
||
"4 -98,955 2.3 30 56% 3.51% 2898047 \n",
|
||
"5 -233,379 3.6 23 35% 2.83% 4327022 \n",
|
||
".. ... ... ... ... ... ... \n",
|
||
"231 NaN N.A. N.A. 10% 0.00% 3 \n",
|
||
"232 NaN N.A. N.A. 66% 0.00% 103 \n",
|
||
"233 NaN N.A. N.A. 46% 0.00% 11 \n",
|
||
"234 NaN N.A. N.A. 0% 0.00% 17 \n",
|
||
"235 NaN N.A. N.A. N.A. 0.00% 2 \n",
|
||
"\n",
|
||
"[235 rows x 12 columns]"
|
||
]
|
||
},
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"countries = pd.read_csv(\n",
|
||
" \"data/world-population-by-country-2020.csv\", index_col=\"no\"\n",
|
||
")\n",
|
||
"\n",
|
||
"countries[\"Population2020\"] = countries[\"Population2020\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"countries[\"Net Change\"] = countries[\"NetChange\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"countries[\"Yearly\"] = countries[\"Yearly\"].apply(\n",
|
||
" lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
|
||
")\n",
|
||
"countries[\"LandArea\"] = countries[\"LandArea\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"countries"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Унитарное кодирование признаков Пол (Sex) и Порт посадки (Embarked)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Кодирование"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"import numpy as np\n",
|
||
"\n",
|
||
"# encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
|
||
"\n",
|
||
"# encoded_values = encoder.fit_transform(titanic[[\"Embarked\", \"Sex\"]])\n",
|
||
"\n",
|
||
"# encoded_columns = encoder.get_feature_names_out([\"Embarked\", \"Sex\"])\n",
|
||
"\n",
|
||
"# encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n",
|
||
"\n",
|
||
"# encoded_values_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Добавление признаков в исходный Dataframe"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# titanic = pd.concat([titanic, encoded_values_df], axis=1)\n",
|
||
"\n",
|
||
"# titanic"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Дискретизация признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Равномерное разделение данных на 3 группы"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"labels = [\"Small\", \"Middle\", \"Big\"]\n",
|
||
"num_bins = 3"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(array([ 0. , 5458956.66666667, 10917913.33333333,\n",
|
||
" 16376870. ]),\n",
|
||
" array([229, 5, 1]))"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"hist1, bins1 = np.histogram(\n",
|
||
" countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins=num_bins\n",
|
||
")\n",
|
||
"bins1, hist1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>(5458956.667, 10917913.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>(5458956.667, 10917913.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>(5458956.667, 10917913.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>(10917913.333, 16376870.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 (5458956.667, 10917913.333]\n",
|
||
"2 2973190 (0.0, 5458956.667]\n",
|
||
"3 9147420 (5458956.667, 10917913.333]\n",
|
||
"4 1811570 (0.0, 5458956.667]\n",
|
||
"5 770880 (0.0, 5458956.667]\n",
|
||
"6 8358140 (5458956.667, 10917913.333]\n",
|
||
"7 910770 (0.0, 5458956.667]\n",
|
||
"8 130170 (0.0, 5458956.667]\n",
|
||
"9 16376870 (10917913.333, 16376870.0]\n",
|
||
"10 1943950 (0.0, 5458956.667]\n",
|
||
"11 364555 (0.0, 5458956.667]\n",
|
||
"12 1000000 (0.0, 5458956.667]\n",
|
||
"13 298170 (0.0, 5458956.667]\n",
|
||
"14 995450 (0.0, 5458956.667]\n",
|
||
"15 310070 (0.0, 5458956.667]\n",
|
||
"16 2267050 (0.0, 5458956.667]\n",
|
||
"17 769630 (0.0, 5458956.667]\n",
|
||
"18 1628550 (0.0, 5458956.667]\n",
|
||
"19 348560 (0.0, 5458956.667]\n",
|
||
"20 510890 (0.0, 5458956.667]"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins1))], axis=1\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 Middle\n",
|
||
"2 2973190 Small\n",
|
||
"3 9147420 Middle\n",
|
||
"4 1811570 Small\n",
|
||
"5 770880 Small\n",
|
||
"6 8358140 Middle\n",
|
||
"7 910770 Small\n",
|
||
"8 130170 Small\n",
|
||
"9 16376870 Big\n",
|
||
"10 1943950 Small\n",
|
||
"11 364555 Small\n",
|
||
"12 1000000 Small\n",
|
||
"13 298170 Small\n",
|
||
"14 995450 Small\n",
|
||
"15 310070 Small\n",
|
||
"16 2267050 Small\n",
|
||
"17 769630 Small\n",
|
||
"18 1628550 Small\n",
|
||
"19 348560 Small\n",
|
||
"20 510890 Small"
|
||
]
|
||
},
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins1), labels=labels)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Равномерное разделение данных на 3 группы c установкой собственной границы диапазона значений (от 0 до 100)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(array([ 0., 4000000., 8000000., 12000000.]),\n",
|
||
" array([229, 1, 4, 1]))"
|
||
]
|
||
},
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"labels = [\"Small\", \"Middle\", \"Big\"]\n",
|
||
"bins2 = np.linspace(0, 12000000, 4)\n",
|
||
"\n",
|
||
"tmp_bins2 = np.digitize(\n",
|
||
" countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins2\n",
|
||
")\n",
|
||
"\n",
|
||
"hist2 = np.bincount(tmp_bins2 - 1)\n",
|
||
"\n",
|
||
"bins2, hist2"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>(8000000.0, 12000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>(8000000.0, 12000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>(8000000.0, 12000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 (8000000.0, 12000000.0]\n",
|
||
"2 2973190 (0.0, 4000000.0]\n",
|
||
"3 9147420 (8000000.0, 12000000.0]\n",
|
||
"4 1811570 (0.0, 4000000.0]\n",
|
||
"5 770880 (0.0, 4000000.0]\n",
|
||
"6 8358140 (8000000.0, 12000000.0]\n",
|
||
"7 910770 (0.0, 4000000.0]\n",
|
||
"8 130170 (0.0, 4000000.0]\n",
|
||
"9 16376870 NaN\n",
|
||
"10 1943950 (0.0, 4000000.0]\n",
|
||
"11 364555 (0.0, 4000000.0]\n",
|
||
"12 1000000 (0.0, 4000000.0]\n",
|
||
"13 298170 (0.0, 4000000.0]\n",
|
||
"14 995450 (0.0, 4000000.0]\n",
|
||
"15 310070 (0.0, 4000000.0]\n",
|
||
"16 2267050 (0.0, 4000000.0]\n",
|
||
"17 769630 (0.0, 4000000.0]\n",
|
||
"18 1628550 (0.0, 4000000.0]\n",
|
||
"19 348560 (0.0, 4000000.0]\n",
|
||
"20 510890 (0.0, 4000000.0]"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins2))], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 Big\n",
|
||
"2 2973190 Small\n",
|
||
"3 9147420 Big\n",
|
||
"4 1811570 Small\n",
|
||
"5 770880 Small\n",
|
||
"6 8358140 Big\n",
|
||
"7 910770 Small\n",
|
||
"8 130170 Small\n",
|
||
"9 16376870 NaN\n",
|
||
"10 1943950 Small\n",
|
||
"11 364555 Small\n",
|
||
"12 1000000 Small\n",
|
||
"13 298170 Small\n",
|
||
"14 995450 Small\n",
|
||
"15 310070 Small\n",
|
||
"16 2267050 Small\n",
|
||
"17 769630 Small\n",
|
||
"18 1628550 Small\n",
|
||
"19 348560 Small\n",
|
||
"20 510890 Small"
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins2), labels=labels)],\n",
|
||
" axis=1,\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Равномерное разделение данных на 3 группы c установкой собственных интервалов (0 - 39, 40 - 60, 61 - 100)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(array([0.e+00, 1.e+03, 1.e+05, 5.e+05, 3.e+06, inf]),\n",
|
||
" array([52, 77, 56, 44, 6]))"
|
||
]
|
||
},
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"labels2 = [\"Dwarf\", \"Small\", \"Middle\", \"Big\", \"Giant\"]\n",
|
||
"hist3, bins3 = np.histogram(\n",
|
||
"\n",
|
||
" countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins=[0, 1000, 100000, 500000, 3000000, np.inf]\n",
|
||
")\n",
|
||
"\n",
|
||
"\n",
|
||
"bins3, hist3"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>(3000000.0, inf]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>(3000000.0, inf]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>(3000000.0, inf]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>(100000.0, 500000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>(3000000.0, inf]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>(100000.0, 500000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>(100000.0, 500000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>(100000.0, 500000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>(100000.0, 500000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 (3000000.0, inf]\n",
|
||
"2 2973190 (500000.0, 3000000.0]\n",
|
||
"3 9147420 (3000000.0, inf]\n",
|
||
"4 1811570 (500000.0, 3000000.0]\n",
|
||
"5 770880 (500000.0, 3000000.0]\n",
|
||
"6 8358140 (3000000.0, inf]\n",
|
||
"7 910770 (500000.0, 3000000.0]\n",
|
||
"8 130170 (100000.0, 500000.0]\n",
|
||
"9 16376870 (3000000.0, inf]\n",
|
||
"10 1943950 (500000.0, 3000000.0]\n",
|
||
"11 364555 (100000.0, 500000.0]\n",
|
||
"12 1000000 (500000.0, 3000000.0]\n",
|
||
"13 298170 (100000.0, 500000.0]\n",
|
||
"14 995450 (500000.0, 3000000.0]\n",
|
||
"15 310070 (100000.0, 500000.0]\n",
|
||
"16 2267050 (500000.0, 3000000.0]\n",
|
||
"17 769630 (500000.0, 3000000.0]\n",
|
||
"18 1628550 (500000.0, 3000000.0]\n",
|
||
"19 348560 (100000.0, 500000.0]\n",
|
||
"20 510890 (500000.0, 3000000.0]"
|
||
]
|
||
},
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins3))], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 Giant\n",
|
||
"2 2973190 Big\n",
|
||
"3 9147420 Giant\n",
|
||
"4 1811570 Big\n",
|
||
"5 770880 Big\n",
|
||
"6 8358140 Giant\n",
|
||
"7 910770 Big\n",
|
||
"8 130170 Middle\n",
|
||
"9 16376870 Giant\n",
|
||
"10 1943950 Big\n",
|
||
"11 364555 Middle\n",
|
||
"12 1000000 Big\n",
|
||
"13 298170 Middle\n",
|
||
"14 995450 Big\n",
|
||
"15 310070 Middle\n",
|
||
"16 2267050 Big\n",
|
||
"17 769630 Big\n",
|
||
"18 1628550 Big\n",
|
||
"19 348560 Middle\n",
|
||
"20 510890 Big"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins3), labels=labels2)],\n",
|
||
" axis=1,\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Квантильное разделение данных на 5 групп\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 4\n",
|
||
"2 2973190 4\n",
|
||
"3 9147420 4\n",
|
||
"4 1811570 4\n",
|
||
"5 770880 4\n",
|
||
"6 8358140 4\n",
|
||
"7 910770 4\n",
|
||
"8 130170 2\n",
|
||
"9 16376870 4\n",
|
||
"10 1943950 4\n",
|
||
"11 364555 3\n",
|
||
"12 1000000 4\n",
|
||
"13 298170 3\n",
|
||
"14 995450 4\n",
|
||
"15 310070 3\n",
|
||
"16 2267050 4\n",
|
||
"17 769630 4\n",
|
||
"18 1628550 4\n",
|
||
"19 348560 3\n",
|
||
"20 510890 3"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([countries[\"LandArea\"], pd.qcut(countries[\"LandArea\"], q=5, labels=False)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 Giant\n",
|
||
"2 2973190 Giant\n",
|
||
"3 9147420 Giant\n",
|
||
"4 1811570 Giant\n",
|
||
"5 770880 Giant\n",
|
||
"6 8358140 Giant\n",
|
||
"7 910770 Giant\n",
|
||
"8 130170 Middle\n",
|
||
"9 16376870 Giant\n",
|
||
"10 1943950 Giant\n",
|
||
"11 364555 Big\n",
|
||
"12 1000000 Giant\n",
|
||
"13 298170 Big\n",
|
||
"14 995450 Giant\n",
|
||
"15 310070 Big\n",
|
||
"16 2267050 Giant\n",
|
||
"17 769630 Giant\n",
|
||
"18 1628550 Giant\n",
|
||
"19 348560 Big\n",
|
||
"20 510890 Big"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([countries[\"LandArea\"], pd.qcut(countries[\"LandArea\"], q=5, labels=labels2)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Пример конструирования признаков на основе существующих\n",
|
||
"\n",
|
||
"Title - обращение к пассажиру (Mr, Mrs, Miss)\n",
|
||
"\n",
|
||
"Is_married - замужняя ли женщина\n",
|
||
"\n",
|
||
"Cabin_type - палуба (тип каюты)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# titanic_cl = titanic.drop(\n",
|
||
"# [\"Embarked_Q\", \"Embarked_S\", \"Embarked_nan\", \"Sex_male\"], axis=1, errors=\"ignore\"\n",
|
||
"# )\n",
|
||
"# titanic_cl = titanic_cl.dropna()\n",
|
||
"\n",
|
||
"# titanic_cl[\"Title\"] = [\n",
|
||
"# i.split(\",\")[1].split(\".\")[0].strip() for i in titanic_cl[\"Name\"]\n",
|
||
"# ]\n",
|
||
"\n",
|
||
"# titanic_cl[\"Is_married\"] = [1 if i == \"Mrs\" else 0 for i in titanic_cl[\"Title\"]]\n",
|
||
"\n",
|
||
"# titanic_cl[\"Cabin_type\"] = [i[0] for i in titanic_cl[\"Cabin\"]]\n",
|
||
"\n",
|
||
"# titanic_cl"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Пример использования библиотеки Featuretools для автоматического конструирования (синтеза) признаков\n",
|
||
"\n",
|
||
"https://featuretools.alteryx.com/en/stable/getting_started/using_entitysets.html"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Загрузка данных"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\Методы искусственного интеллекта\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1379: SyntaxWarning: invalid escape sequence '\\l'\n",
|
||
" columns_string = \"\\l\".join(column_typing_info) # noqa: W605\n",
|
||
"d:\\Методы искусственного интеллекта\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1381: SyntaxWarning: invalid escape sequence '\\l'\n",
|
||
" label = \"{%s (%d row%s)|%s\\l}\" % ( # noqa: W605\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"( no Country Population2020 Yearly NetChange Density \\\n",
|
||
" 0 1 China 1439323776 0.39 5540090 153 \n",
|
||
" 1 2 India 1380004385 0.99 13586631 464 \n",
|
||
" 2 3 United States 331002651 0.59 1937734 36 \n",
|
||
" 3 4 Indonesia 273523615 1.07 2898047 151 \n",
|
||
" 4 5 Pakistan 220892340 2.00 4327022 287 \n",
|
||
" .. ... ... ... ... ... ... \n",
|
||
" 230 231 Montserrat 4992 0.06 3 50 \n",
|
||
" 231 232 Falkland Islands 3480 3.05 103 0 \n",
|
||
" 232 233 Niue 1626 0.68 11 6 \n",
|
||
" 233 234 Tokelau 1357 1.27 17 136 \n",
|
||
" 234 235 Holy See 801 0.25 2 2,003 \n",
|
||
" \n",
|
||
" LandArea \n",
|
||
" 0 9388211 \n",
|
||
" 1 2973190 \n",
|
||
" 2 9147420 \n",
|
||
" 3 1811570 \n",
|
||
" 4 770880 \n",
|
||
" .. ... \n",
|
||
" 230 100 \n",
|
||
" 231 12170 \n",
|
||
" 232 260 \n",
|
||
" 233 10 \n",
|
||
" 234 0 \n",
|
||
" \n",
|
||
" [235 rows x 7 columns],\n",
|
||
" Year Population YearlyPer Yearly Median Fertility Density\n",
|
||
" 0 2020 7794798739 1.10 83000320 31 2.47 52\n",
|
||
" 1 2025 8184437460 0.98 77927744 32 2.54 55\n",
|
||
" 2 2030 8548487400 0.87 72809988 33 2.62 57\n",
|
||
" 3 2035 8887524213 0.78 67807363 34 2.70 60\n",
|
||
" 4 2040 9198847240 0.69 62264605 35 2.77 62\n",
|
||
" 5 2045 9481803274 0.61 56591207 35 2.85 64\n",
|
||
" 6 2050 9735033990 0.53 50646143 36 2.95 65,\n",
|
||
" Country Capital Continent\n",
|
||
" 0 Afghanistan Kabul Asia\n",
|
||
" 1 Albania Tirana Europe\n",
|
||
" 2 Algeria Algiers Africa\n",
|
||
" 3 American Samoa Pago Pago Oceania\n",
|
||
" 4 Andorra Andorra la Vella Europe\n",
|
||
" .. ... ... ...\n",
|
||
" 229 Wallis and Futuna Mata-Utu Oceania\n",
|
||
" 230 Western Sahara El Aai?�n Africa\n",
|
||
" 231 Yemen Sanaa Asia\n",
|
||
" 232 Zambia Lusaka Africa\n",
|
||
" 233 Zimbabwe Harare Africa\n",
|
||
" \n",
|
||
" [234 rows x 3 columns])"
|
||
]
|
||
},
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import featuretools as ft\n",
|
||
"from woodwork.logical_types import Categorical, Datetime\n",
|
||
"\n",
|
||
"info = pd.read_csv(\"data/world-population-by-country-2020.csv\")\n",
|
||
"forcast = pd.read_csv(\"data/world-population-forcast-2020-2050.csv\")\n",
|
||
"capitals = pd.read_csv(\"data/countries-continents-capitals.csv\", encoding=\"ISO-8859-1\")\n",
|
||
"forcast[\"Population\"] = forcast[\"Population\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"forcast[\"YearlyPer\"] = forcast[\"YearlyPer\"].apply(\n",
|
||
" lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
|
||
")\n",
|
||
"forcast[\"Yearly\"] = forcast[\"Yearly\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"info = info.drop([\"Migrants\", \"FertRate\", \"MedAge\", \"UrbanPop\", \"WorldShare\"], axis=1)\n",
|
||
"info[\"Population2020\"] = info[\"Population2020\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"info[\"Yearly\"] = info[\"Yearly\"].apply(\n",
|
||
" lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
|
||
")\n",
|
||
"info[\"NetChange\"] = info[\"NetChange\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"info[\"LandArea\"] = info[\"LandArea\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"\n",
|
||
"info, forcast, capitals"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Создание сущностей в featuretools\n",
|
||
"\n",
|
||
"Добавление dataframe'ов с данными в EntitySet с указанием параметров: название сущности (таблицы), первичный ключ, логические типы атрибутов (категориальные, даты)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\Методы искусственного интеллекта\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"d:\\Методы искусственного интеллекта\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Entityset: countries\n",
|
||
" DataFrames:\n",
|
||
" countries [Rows: 235, Columns: 7]\n",
|
||
" capitals [Rows: 234, Columns: 3]\n",
|
||
" forcast [Rows: 7, Columns: 8]\n",
|
||
" Relationships:\n",
|
||
" No relationships"
|
||
]
|
||
},
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"es = ft.EntitySet(id=\"countries\")\n",
|
||
"\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"countries\",\n",
|
||
" dataframe=info,\n",
|
||
" index=\"no\",\n",
|
||
" logical_types={\n",
|
||
" \"Country\": Categorical,\n",
|
||
" },\n",
|
||
")\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"capitals\",\n",
|
||
" dataframe=capitals,\n",
|
||
" index=\"Country\",\n",
|
||
" logical_types={\n",
|
||
" \"Country\": Categorical,\n",
|
||
" \"Capital\": Categorical,\n",
|
||
" \"Continent\": Categorical,\n",
|
||
" },\n",
|
||
")\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"forcast\",\n",
|
||
" dataframe=forcast,\n",
|
||
" index=\"forcast_id\",\n",
|
||
" make_index=True,\n",
|
||
" logical_types={\n",
|
||
" \"Year\": Datetime,\n",
|
||
" },\n",
|
||
")\n",
|
||
"\n",
|
||
"es"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Настройка связей между сущностями featuretools\n",
|
||
"\n",
|
||
"Настройка связей между таблицами на уровне ключей\n",
|
||
"\n",
|
||
"Связь указывается от родителя к потомкам (таблица-родитель, первичный ключ, таблица-потомок, внешний ключ)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Entityset: countries\n",
|
||
" DataFrames:\n",
|
||
" countries [Rows: 235, Columns: 7]\n",
|
||
" capitals [Rows: 234, Columns: 3]\n",
|
||
" forcast [Rows: 7, Columns: 8]\n",
|
||
" Relationships:\n",
|
||
" countries.Country -> capitals.Country"
|
||
]
|
||
},
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"es = es.add_relationship(\"capitals\", \"Country\", \"countries\", \"Country\")\n",
|
||
"\n",
|
||
"es"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Автоматическое конструирование признаков с помощью featuretools\n",
|
||
"\n",
|
||
"Библиотека применяет различные функции агрегации и трансформации к атрибутам таблицы countries с учетом отношений\n",
|
||
"\n",
|
||
"Результат помещается в Dataframe feature_matrix"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>Yearly</th>\n",
|
||
" <th>NetChange</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>capitals.Capital</th>\n",
|
||
" <th>capitals.Continent</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>0.39</td>\n",
|
||
" <td>5540090</td>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>Beijing</td>\n",
|
||
" <td>Asia</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>0.99</td>\n",
|
||
" <td>13586631</td>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>New Delhi</td>\n",
|
||
" <td>Asia</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>0.59</td>\n",
|
||
" <td>1937734</td>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>Washington, D.C.</td>\n",
|
||
" <td>North America</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>1.07</td>\n",
|
||
" <td>2898047</td>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>Jakarta</td>\n",
|
||
" <td>Asia</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>2.00</td>\n",
|
||
" <td>4327022</td>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>Islamabad</td>\n",
|
||
" <td>Asia</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>231</th>\n",
|
||
" <td>Montserrat</td>\n",
|
||
" <td>4992</td>\n",
|
||
" <td>0.06</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>Brades</td>\n",
|
||
" <td>North America</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>232</th>\n",
|
||
" <td>Falkland Islands</td>\n",
|
||
" <td>3480</td>\n",
|
||
" <td>3.05</td>\n",
|
||
" <td>103</td>\n",
|
||
" <td>12170</td>\n",
|
||
" <td>Stanley</td>\n",
|
||
" <td>South America</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>233</th>\n",
|
||
" <td>Niue</td>\n",
|
||
" <td>1626</td>\n",
|
||
" <td>0.68</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>260</td>\n",
|
||
" <td>Alofi</td>\n",
|
||
" <td>Oceania</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>234</th>\n",
|
||
" <td>Tokelau</td>\n",
|
||
" <td>1357</td>\n",
|
||
" <td>1.27</td>\n",
|
||
" <td>17</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>Nukunonu</td>\n",
|
||
" <td>Oceania</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>235</th>\n",
|
||
" <td>Holy See</td>\n",
|
||
" <td>801</td>\n",
|
||
" <td>0.25</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>235 rows × 7 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 Yearly NetChange LandArea \\\n",
|
||
"no \n",
|
||
"1 China 1439323776 0.39 5540090 9388211 \n",
|
||
"2 India 1380004385 0.99 13586631 2973190 \n",
|
||
"3 United States 331002651 0.59 1937734 9147420 \n",
|
||
"4 Indonesia 273523615 1.07 2898047 1811570 \n",
|
||
"5 Pakistan 220892340 2.00 4327022 770880 \n",
|
||
".. ... ... ... ... ... \n",
|
||
"231 Montserrat 4992 0.06 3 100 \n",
|
||
"232 Falkland Islands 3480 3.05 103 12170 \n",
|
||
"233 Niue 1626 0.68 11 260 \n",
|
||
"234 Tokelau 1357 1.27 17 10 \n",
|
||
"235 Holy See 801 0.25 2 0 \n",
|
||
"\n",
|
||
" capitals.Capital capitals.Continent \n",
|
||
"no \n",
|
||
"1 Beijing Asia \n",
|
||
"2 New Delhi Asia \n",
|
||
"3 Washington, D.C. North America \n",
|
||
"4 Jakarta Asia \n",
|
||
"5 Islamabad Asia \n",
|
||
".. ... ... \n",
|
||
"231 Brades North America \n",
|
||
"232 Stanley South America \n",
|
||
"233 Alofi Oceania \n",
|
||
"234 Nukunonu Oceania \n",
|
||
"235 NaN NaN \n",
|
||
"\n",
|
||
"[235 rows x 7 columns]"
|
||
]
|
||
},
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"feature_matrix, feature_defs = ft.dfs(\n",
|
||
" entityset=es,\n",
|
||
" target_dataframe_name=\"countries\",\n",
|
||
" max_depth=1,\n",
|
||
")\n",
|
||
"\n",
|
||
"feature_matrix"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Полученные признаки\n",
|
||
"\n",
|
||
"Список колонок полученного dataframe'а"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"[<Feature: Country>,\n",
|
||
" <Feature: Population2020>,\n",
|
||
" <Feature: Yearly>,\n",
|
||
" <Feature: NetChange>,\n",
|
||
" <Feature: LandArea>,\n",
|
||
" <Feature: capitals.Capital>,\n",
|
||
" <Feature: capitals.Continent>]"
|
||
]
|
||
},
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"feature_defs"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Отсечение значений признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Определение выбросов с помощью boxplot"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Axes: >"
|
||
]
|
||
},
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAiMAAAGsCAYAAAAPJKchAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAKwlJREFUeJzt3Qt0VNXZ//EnISEXkJuBBDAKiIosERBqCBUBCYmAtOiypWANpZX+vXUhgVZBCY0XUG5iNZRCC0pbAUVACxhCo5GqaSkgCpWLSCA2kJDYFwIJhkDmv57dd+bNhARnQpjNzHw/ax0n58w+M2dwzcxv9jXE4XA4BAAAwJJQW08MAACgCCMAAMAqwggAALCKMAIAAKwijAAAAKsIIwAAwCrCCAAAsIowAgAArCKMAAAAqwgjAADAKr8KI1u2bJGRI0dKhw4dJCQkRNatW+f1Y7zxxhvSq1cviY6OlmuuuUbmzJlzSa4VAAAEYBgpLy+Xnj17SmZmZoPOf/fdd+W+++6TBx98UHbv3i0LFy6UF198UV555ZVGv1YAAOCZEH9dKE9rRtauXSujRo1yHausrJQnn3xSVqxYIcePH5ebbrpJXnjhBRk0aJC5f+zYsVJVVSVvvvmm65yXX35ZZs+eLQUFBeYxAQCAb/lVzci3efTRRyUvL09Wrlwpn332mfzgBz+QO++8U7744gtXWImMjHQ7JyoqSv7973/L4cOHLV01AADBLWDCiNZsLFu2zNR6DBgwQK699lqZMmWK3Hbbbea4SklJkTVr1khOTo5UV1fL/v37Zd68eea+o0ePWn4FAAAEpzAJELt27ZJz587J9ddf73Zca0OuvPJK8/eECRPkyy+/lLvuuss017Ro0UImTpwov/71ryU0NGByGQAAfiVgwsipU6ekSZMmsn37dnNbU/Pmzc2t9gnRPiQzZ86UoqIiadu2raklUV26dLFy3QAABLuACSO9e/c2NSPHjh0zzTQXomGlY8eO5m/t7JqYmGiCCQAA8L0wf6v9OHDggGs/Pz9fdu7cKW3atDHNMzpsNzU11fQD0XBSUlJiaj5uvvlmGTFihJSWlsrq1avN6JpvvvnG1cfkgw8+sPq6AAAIZn41tDc3N1cGDx583vFx48bJq6++avqBPPvss7J8+XIpLCyUmJgY6devn2RkZEiPHj1MGNFJ07R/ib5srRF57rnnJCEhwcrrAQAAfhZGAABA4GEICQAAsIowAgAArPKLDqw6QdmRI0fkiiuuYMp2AAD8hPYEOXnypFng9kLzeflFGNEgEh8fb/syAABAA3z11Vdy1VVX+XcY0RoR54vRWVMBBA4dBZednS3JyckSHh5u+3IANKKysjJTmeD8HvfrMOJsmtEgQhgBAi+MREdHm/c2YQQITN/WxYIOrAAAwCrCCAAAsIowAgAArCKMAAAAqwgjAADAKsIIAACwijACAACsIowAAACrCCMArDl37px88MEHsmXLFnOr+wCCD2EEgBVr1qyRrl27ytChQ2X+/PnmVvf1OIDgQhgB4HMaOO69917p0aOH/O1vf5MVK1aYW93X4wQSILiEOHR9Xz9YaKdly5Zy4sQJ1qYB/Jw2xWgNiAaPdevWmf2NGzfK8OHDpUmTJjJq1CjZvXu3fPHFF2YfgP/y9PubmhEAPqU1IIcOHZJp06ZJaKj7R5DuT506VfLz8005AMGBMALAp44ePWpub7rppjrvdx53lgMQ+AgjAHyqffv25labYuriPO4sByDwEUYA+NSAAQOkU6dOMnPmTKmurna7T/dnzZolnTt3NuUABAfCCACf0k6p8+bNk/Xr15vOqn//+9/l9OnT5lb39fjcuXPpvAoEkTDbFwAg+Nxzzz2yevVqmTx5stx+++2u41ojosf1fgDBg6G9AKzRYb3vv/++vPvuuzJs2DAZPHgwNSJAALlkQ3t12uaRI0dKhw4dJCQkxMwT4KmPPvpIwsLCpFevXt4+LY
AApMFj4MCBpnZEbwkiQHDyOoyUl5dLz549JTMz06vzjh8/LqmpqTJkyBBvnxIAAAQwr/uMaFWqbt568MEHZezYseaXjze1KQAAILD5pAPrsmXL5ODBg/KnP/1Jnn322W8tX1lZabaabU6qqqrKbAACp89Ibm6uaf6NiIiQQYMG0VQDBBBPv7MveRjR9SWeeOIJM7Wz9hfxhM4zkJGRcd7x7OxsiY6OvgRXCcDX8vLyzA+VY8eOmX1dubddu3Yyfvx4SUxMtH15ABpBRUWF/TCiv3q0aUaDxfXXX+/xebo2RVpamlvNSHx8vCQnJzOaBggAa9euldmzZ5vF8aZMmSJFRUUSFxdn5hfR4ytXrpS7777b9mUCuEjOlo1LOrRXR9Poh4pOVFRfp9XWrVu7VbvqDIv6lHpMazruuOOOb30ehvYCgYNVe4HgUebh9/clrRnRJ961a5fbsYULF8p7771nJjbSCY4ABOeqvStWrDCr9GoYqb1qb//+/U057UMCIPB5HUZOnTolBw4ccO3rUt87d+6UNm3ayNVXX20+SAoLC2X58uXmg6X2ypzaJhwZGVnvip0AAhur9gK46HlGtm3bJr179zab0r4d+nd6errrA6SgoMDbhwUQJFi1F0BtTAcPwKfoMwIEj7JLNR08AFwMVu0FUBur9gLwOVbtBVATzTQArGHVXiCwXRZDewHAk1V7dQFOVu0Fghd9RgAAgFWEEQAAYBVhBAAAWEUYAQAAVhFGAACAVYQRAABgFWEEAABYRRgBAABWEUYAAIBVhBEAAGAVYQQAAFhFGAEAAFYRRgAAgFWEEQAAYBVhBAAAWEUYAQAAVhFGAACAVYQRAABgFWEEAABYRRgBAABWEUYAAIBVhBEAAGAVYQQAAFhFGAEAAFYRRgAAgFWEEQAAYBVhBAAAWEUYAQAAVhFGAACAVYQRAABgFWEEAAD4VxjZsmWLjBw5Ujp06CAhISGybt26C5Zfs2aNDB06VNq2bSstWrSQxMRE2bRp08VcMwAACOYwUl5eLj179pTMzEyPw4uGkY0bN8r27dtl8ODBJsx88sknDbleAAAQYMK8PWHYsGFm89SCBQvc9mfOnClvv/22/OUvf5HevXt7+/QAACDYw8jFqq6ulpMnT0qbNm3qLVNZWWk2p7KyMnNbVVVlNgCBw/me5r0NBB5P39c+DyNz586VU6dOyQ9/+MN6y8yaNUsyMjLOO56dnS3R0dGX+AoB2LB582bblwCgkVVUVHhULsThcDga+iTagXXt2rUyatQoj8q//vrrMmHCBNNMk5SU5FXNSHx8vJSWlppOsAAC65eTBhHtWxYeHm77cgA0Iv3+jomJkRMnTlzw+9tnNSMrV66UBx54QN58880LBhEVERFhttr0g4oPKyAw8f4GAo+n72mfzDOyYsUKGT9+vLkdMWKEL54SAAD4Ca9rRrS/x4EDB1z7+fn5snPnTtMh9eqrr5apU6dKYWGhLF++3NU0M27cOHnppZckISFBioqKzPGoqChp2bJlY74WAADgh7yuGdm2bZsZkusclpuWlmb+Tk9PN/tHjx6VgoICV/nFixfL2bNn5ZFHHpH27du7tokTJzbm6wAAAMFSMzJo0CC5UJ/XV1991W0/Nze3YVcGAACCAmvTAAAAqwgjAADAKsIIAACwijACAACsIowAAACrCCMAAMAqwggAALCKMAIAAKwijAAAAKsIIwAAwCrCCAAAsIowAgAArCKMAAAAqwgjAADAKsIIAACwijACAACsIowAAACrCCMAAMAqwggAALCKMAIAAKwijAAAAKsIIwAAwCrCCAAAsIowAgAArCKMAAAAqwgjAADAKsIIAACwijACAACsIowAAACrCCMAAMAqwggAALCKMAIAAKwijAAAAKsIIwAAwCrCCAAAsIowAgAA/CuMbNmyRUaOHCkdOnSQkJAQWbdu3beek5ubK7fccotERERI165d5dVXX23o9QIAgGAPI+Xl5dKzZ0
/JzMz0qHx+fr6MGDFCBg8eLDt37pTHHntMHnjgAdm0aVNDrhcAAASYMG9PGDZsmNk8tWjRIuncubPMmzfP7N94443y4YcfyosvvigpKSnePj0AAAj2MOKtvLw8SUpKcjumIURrSOpTWVlpNqeysjJzW1VVZTYAgcP5nua9DQQeT9/XlzyMFBUVSWxsrNsx3deAcfr0aYmKijrvnFmzZklGRsZ5x7OzsyU6OvqSXi8AOzZv3mz7EgA0soqKissjjDTE1KlTJS0tzbWvwSU+Pl6Sk5OlRYsWVq8NQOP/ctIgMnToUAkPD7d9OQAakbNlw3oYiYuLk+LiYrdjuq+hoq5aEaWjbnSrTT+o+LACAhPvbyDwePqevuTzjCQmJkpOTo7bMf0VpMcBAAC8DiOnTp0yQ3R1cw7d1b8LCgpcTSypqamu8g8++KAcPHhQfvWrX8nevXtl4cKF8sYbb8ikSZMa83UAAIBgCSPbtm2T3r17m01p3w79Oz093ewfPXrUFUyUDuvdsGGDqQ3R+Ul0iO/vf/97hvUCAICG9RkZNGiQOByOeu+va3ZVPeeTTz7x9qkAAEAQYG0aAABgFWEEAABYRRgBAABWEUYAAIBVhBEAAGAVYQQAAFhFGAEAAFYRRgAAgFWEEQAAYBVhBAAAWEUYAQAAVhFGAACAVYQRAABgFWEEAABYRRgBAABWEUYAAIBVhBEAAGAVYQQAAFhFGAEAAFYRRgAAgFWEEQAAYBVhBAAAWEUYAQAAVhFGAACAVYQRAABgFWEEAABYRRgBAABWEUYAAIBVhBEAAGAVYQQAAFhFGAEAAFYRRgAAgFWEEQAAYBVhBAAAWEUYAQAAVhFGAACA/4WRzMxM6dSpk0RGRkpCQoJs3br1guUXLFggN9xwg0RFRUl8fLxMmjRJvvnmm4ZeMwAACOYwsmrVKklLS5MZM2bIjh07pGfPnpKSkiLHjh2rs/zrr78uTzzxhCm/Z88e+cMf/mAeY9q0aY1x/QAAINjCyPz582XChAkyfvx46d69uyxatEiio6Nl6dKldZb/+OOP5bvf/a6MHTvW1KYkJyfLmDFjvrU2BQAABIcwbwqfOXNGtm/fLlOnTnUdCw0NlaSkJMnLy6vznP79+8uf/vQnEz5uvfVWOXjwoGzcuFHuv//+ep+nsrLSbE5lZWXmtqqqymwAAofzPc17Gwg8nr6vvQojpaWlcu7cOYmNjXU7rvt79+6t8xytEdHzbrvtNnE4HHL27Fl58MEHL9hMM2vWLMnIyDjveHZ2tqmFARB4Nm/ebPsSADSyioqKxg8jDZGbmyszZ86UhQsXms6uBw4ckIkTJ8ozzzwj06dPr/McrXnRfik1a0a046s28bRo0eJSXzIAH/9y0iAydOhQCQ8Pt305ABqRs2WjUcNITEyMNGnSRIqLi92O635cXFyd52jg0CaZBx54wOz36NFDysvL5ec//7k8+eSTppmntoiICLPVph9UfFgBgYn3NxB4PH1Pe9WBtWnTptKnTx/JyclxHauurjb7iYmJ9VbR1A4cGmiUNtsAAIDg5nUzjTafjBs3Tvr27Ws6pOocIlrToaNrVGpqqnTs2NH0+1AjR440I3B69+7taqbR2hI97gwlAAAgeHkdRkaPHi0lJSWSnp4uRUVF0qtXL8nKynJ1ai0oKHCrCXnqqackJCTE3BYWFkrbtm1NEHnuueca95UAAAC/FOLwg7YS7QDTsmVLOXHiBB1YgQDswKrD/YcPH06fESDAePr9zdo0AADAKsIIAACwijACAACsIowAAACrCCMAAMAqwggAALCKMAIAAKwijAAAAKsIIwAAwCrCCAAAsIowAgAArCKMAAAAqwgjAADAKsIIAACwijACAACsIowAAACrCCMAAMAqwggAALCKMAIAAKwijAAAAKsIIwAAwCrCCAAAsIowAgAArCKMAAAAqwgjAADAKsIIAACwijACAACsIowAAACrCCMAAMAqwggAALCKMAIAAK
wijAAAAKsIIwAAwCrCCAAAsIowAgAA/C+MZGZmSqdOnSQyMlISEhJk69atFyx//PhxeeSRR6R9+/YSEREh119/vWzcuLGh1wwAAAJImLcnrFq1StLS0mTRokUmiCxYsEBSUlJk37590q5du/PKnzlzRoYOHWruW716tXTs2FEOHz4srVq1aqzXAAAAgimMzJ8/XyZMmCDjx483+xpKNmzYIEuXLpUnnnjivPJ6/D//+Y98/PHHEh4ebo5prQoAAIDXYURrObZv3y5Tp051HQsNDZWkpCTJy8ur85x33nlHEhMTTTPN22+/LW3btpWxY8fK448/Lk2aNKnznMrKSrM5lZWVmduqqiqzAQgczvc0720g8Hj6vvYqjJSWlsq5c+ckNjbW7bju7927t85zDh48KO+9957cd999pp/IgQMH5OGHHzYXOGPGjDrPmTVrlmRkZJx3PDs7W6Kjo725ZAB+YvPmzbYvAUAjq6iouDTNNN6qrq42/UUWL15sakL69OkjhYWFMmfOnHrDiNa8aL+UmjUj8fHxkpycLC1atLjUlwzAh/SHiQYR7VvmbMoFEBicLRuNGkZiYmJMoCguLnY7rvtxcXF1nqMjaPQDpmaTzI033ihFRUWm2adp06bnnaMjbnSrTR+HDysgMPH+BgKPp+9pr4b2anDQmo2cnBy3mg/d134hdfnud79rmma0nNP+/ftNSKkriAAAgODi9Twj2nyyZMkSee2112TPnj3y0EMPSXl5uWt0TWpqqlsHV71fR9NMnDjRhBAdeTNz5kzToRUAAMDrPiOjR4+WkpISSU9PN00tvXr1kqysLFen1oKCAjPCxkn7emzatEkmTZokN998s5lnRIOJjqYBAAAIcTgcDvGDDjAtW7aUEydO0IEVCMAOrDrSbvjw4fQZAQKMp9/frE0DAACsIowAAACrCCMAAMAqwggAALCKMAIAAKwijAAAAKsIIwAAwCrCCAAAsIowAgAArCKMAAAAqwgjAADAKsIIAACwijACAACsIowAAACrCCMAAMAqwggAALCKMAIAAKwijAAAAKsIIwAAwCrCCAAAsIowAgAArCKMAAAAqwgjAADAKsIIAACwijACAACsIowAAACrCCMAAMAqwggAALCKMAIAAKwijAAAAKsIIwAAwCrCCAAAsIowAgAArCKMAAAAqwgjAADAKsIIAADwvzCSmZkpnTp1ksjISElISJCtW7d6dN7KlSslJCRERo0a1ZCnBQAAAcjrMLJq1SpJS0uTGTNmyI4dO6Rnz56SkpIix44du+B5hw4dkilTpsiAAQMu5noBAECwh5H58+fLhAkTZPz48dK9e3dZtGiRREdHy9KlS+s959y5c3LfffdJRkaGdOnS5WKvGQAABJAwbwqfOXNGtm/fLlOnTnUdCw0NlaSkJMnLy6v3vKefflratWsnP/vZz+Rvf/vbtz5PZWWl2ZzKysrMbVVVldkABA7ne5r3NhB4PH1fexVGSktLTS1HbGys23Hd37t3b53nfPjhh/KHP/xBdu7c6fHzzJo1y9Si1JadnW1qYQAEns2bN9u+BACNrKKiovHDiLdOnjwp999/vyxZskRiYmI8Pk9rXrRfSs2akfj4eElOTpYWLVpcoqsFYOuXkwaRoUOHSnh4uO3LAdCInC0bjRpGNFA0adJEiouL3Y7rflxc3Hnlv/zyS9NxdeTIka5j1dXV/33isDDZt2+fXHvtteedFxERYbba9IOKDysgMPH+BgKPp+9przqwNm3aVPr06SM5OTlu4UL3ExMTzyvfrVs32bVrl2micW7f+973ZPDgweZvre0AAADBzetmGm0+GTdunPTt21duvfVWWbBggZSXl5vRNSo1NVU6duxo+n3oPCQ33XST2/mtWrUyt7WPAwCA4OR1GBk9erSUlJRIenq6FBUVSa9evSQrK8vVqbWgoMCMsAEAAPBEiMPhcIgfdIBp2bKlnDhxgg6sQAB2YN24caMMHz6cPiNAgP
H0+5sqDAAAYBVhBAAAWEUYAQAAVhFGAACAVYQRANbo8hIffPCBbNmyxdzqPoDgQxgBYMWaNWuka9euZhp4XQ1cb3VfjwMILoQRAD6ngePee++VHj16mJW8V6xYYW51X48TSIDgwjwjAHxKm2K0BkSDx7p168y+c54RXftq1KhRsnv3bvniiy/MPgD/xTwjAC5LWgOiC2hOmzbtvNmadV9X7c7PzzflAAQHwggAnzp69OgF16dyHneWAxD4CCMAfKp9+/bmVpti6uI87iwHIPARRgD41IABA6RTp04yc+ZMqa6udrtP93XF786dO5tyAIIDYQSAT2mn1Hnz5sn69etNZ9W///3vcvr0aXOr+3p87ty5dF4FgkiY7QsAEHzuueceWb16tUyePFluv/1213GtEdHjej+A4MHQXgDW6LDe999/X959910ZNmyYDB48mBoRIIB4+v1NzQgAazR4DBw4UMrLy80tQQQITvQZAQAAVhFGAACAVYQRANawai8ARRgBYAWr9gJwIowA8DlW7QVQE0N7AfgUq/YCwaOMVXsBXI5YtRdAbYQRAD7Fqr0AaiOMAPApVu0FUBthBIBPsWovgNoIIwB8ilV7AdTG2jQAfI5VewHUxNBeANawai8Q2Fi1F8Blj1V7ASj6jACwhrVpACjCCAArWJsGgBNhBIC1tWl0grPf/OY38uijj5pb3WdtGiD40IEVgJW1aWJiYqSkpEQOHz7suu+aa66Rtm3bytdff83aNEAAoAMrgMt6bRrdoqKi3O47duyYK5xouUGDBlm6SgC+RDMNAJ8qLCx0/T1kyBATOlasWGFudb+ucgACW4PCSGZmppnOOTIyUhISEmTr1q31ll2yZImZ1rl169ZmS0pKumB5AIGtuLjY3Pbs2VPefvtt8xmiNSR6q/s333yzWzkAgc/rMLJq1SpJS0uTGTNmyI4dO8wHSkpKiqlerUtubq6MGTPGTGyUl5cn8fHxkpyczK8eIEhpfxBVu4nGKTo62q0cgMDndRjRIXgTJkyQ8ePHS/fu3WXRokXmw2Pp0qV1lv/zn/8sDz/8sPTq1Uu6desmv//9781iWDk5OY1x/QD8TGjofz92nGvR1F6bRm9rlgMQ+LzqwHrmzBnZvn27TJ061XVMPzC06UVrPTxRUVEhVVVV0qZNm3rLVFZWmq1mb1yl5+kGwH85V+O94YYb5LPPPnNbm0abf/X4vn37TDne74B/8/Q97FUYKS0tNcPyYmNj3Y7r/t69ez16jMcff1w6dOhgAkx9dAnxjIyM845nZ2e7qnAB+Cf9DNGhfho4+vTpY5ptIyIizA8QbfrVHzx6/6lTp2Tjxo22LxfARdAKCE/4dGjv888/LytXrjT9SLTza3205kX7pdSsGXH2NWGeEcD/LV68WH70ox/J559/bsKHk/7YCAkJMfePHDnS6jUCuHjOlo1GDSM6SZFOQlS7l7vux8XFXfDcuXPnmjDy17/+1dVbvj76K0m32sLDw80GwL/98Ic/lLCwMJk8ebKZb6RmLat+Vtxzzz1Wrw9A4/D0O9urHmJNmzY11ao1O586O6MmJibWe97s2bPlmWeekaysLOnbt683TwkgQGng0KYaDR/Dhw83t9rcSxABgo/X3dW1+UTnDnnttddkz5498tBDD5nlv3V0jUpNTXXr4PrCCy/I9OnTzWgb7ZxWVFRkNm0PBhC8dP0Z7aw6ZcoU0zdEb3WfdWmA4ON1GBk9erT5BZOenm6G6+7cudPUeDg7tRYUFMjRo0dd5X/729+aUTi6+FX79u1dmz4GgOBeKK9Hjx5uM7DqPgvlAcGHhfIAWFkoT4PHunXrzL7WjGhTjfZJ07lGdu/ezUJ5QADw9PubWYUAWFkob9q0aaK/hT744APZsmWLudV9bebNz8835QAEB8IIAJ9yNuN++eWXpoZk6NChZmZnvdX9gwcPupUDEPgIIwB8SvuMqfvvv7/OPiN6vGY5AIGPPiMAfEo7tDdr1k
yuvPJK+fe//22aZpx9RnTCs6uuusoskqej9HQ6AQD+y9Pvb5/OwAoAH3/8sZw9e9ZMlnj33Xeb5hntrHr48GHZvHmza1JFLTdo0CDblwvABwgjAHzK2Rdk4sSJ8sorr8j69etd9+noGT3+0ksv0WcECCL0GQHgU86+IBo4ajfD6L4er1kOQOAjjADwqf79+0to6H8/eoYMGeLWgVX3ld6v5QAEB8IIAJ/S0KFrWintvLpjxw756KOPzK2zP73ezzwjQPCgzwgAn8rNzXWt3PvWW2/Jhg0b3PqM6PE33njDlHPWlAAIbNSMALBCA0ddfUb0OIDgQs0IAJ8aMGCA6+877rhDUlJSZP/+/XL99dfLpk2bXDUlNcsBCGyEEQDW5OTkuDXTREZGWr0eAHbQTAPAp2p2TP3mm2/c7qu5TwdWIHgQRgD4lHMkTV01ITX3a5YDENgIIwB8qlWrVq7gERMT43af7jsDibMcgMBHnxEAPnX8+HFXk4wulFdTzX1nOQCBj5oRAABgFWEEgE81b97c9XfHjh3d7rvqqqvqLAcgsBFGAPjUe++95/q7tLTU7b6SkpI6ywEIbIQRAD5Vsy9IZWWl23019+kzAgQPwggAn7rlllsatRwA/0cYAeBTd911V6OWA+D/CCMAfGrlypWNWg6A/yOMAPCp/Pz8Ri0HwP8x6RkAnzp9+rTr77Zt20r37t3NKBr9+/PPP3eNqKlZDkBgI4wA8KmaI2b69u0rU6dOlcLCQjPnyKxZs+Tdd989rxyAwEYYAeBTNWs8srKyXOFDhYb+X8sxNSNA8CCMAPBaRUWF7N27t0HntmjRwvW3w+Fwu6/mSr1abseOHV4/frdu3SQ6OrpB1wbADsIIAK9pEOnTp88lfY7PPvusQc+xfft25igB/AxhBECDah/0S78hzpw5I/379z+vVqSmkJAQ+fjjj6Vp06YNujYA/oUwAsBr2gxyMbUPU6ZMkTlz5lzw/n79+jX48QH4F8IIAJ+bPXu2uZ0/f76cO3fOdTwsLEwmTZrkuh9AcAhxXKiu9DJRVlYmLVu2lBMnTrh1fgPg37TJ5slnZ8vvNvxD/t+IBHnuqV81qGkGgH9/f1MzAgSZ/NJyKa88K5eLft9LlTfP9JZ+3+sh+0t0OO/lMaS3WUSYdI5pZvsygKDQoDCSmZlp2nuLioqkZ8+e8vLLL8utt95ab/k333xTpk+fLocOHZLrrrtOXnjhBRk+fPjFXDeABgaRwXNz5XI0efUuudy8P2UQgQS4HMPIqlWrJC0tTRYtWiQJCQmyYMECSUlJkX379km7du3OK6894seMGWNmVtRVOF9//XUZNWqUmT/gpptuaqzXAcADzhqRBaN7Sdd2zeVyUH66Utbn5sldgxKlWVSEXA4OHDslj63aeVnVIAGBzOswoh3OJkyYIOPHjzf7Gko2bNggS5culSeeeOK88i+99JLceeed8stf/tLsP/PMM7J582Z55ZVXzLkAfCskrEyaRBZKaOTlEUaiws5Kh9ZHJOqKIgkNuzxajptEnjL/TgB8I8zbzmY6t4CuJVFz+uakpCTJy8ur8xw9rjUpNWlNyrp16+p9Hl2Toua6FNoBRlVVVZkNQMOcPF0p4a3+IdO2zpTLzcKshXI5CW81RM6eTeYzB7gInr5/vAojpaWlZhhebGys23Hdr29qaO1XUld5PV4fbdLJyMg473h2djbTPAMXIa84RKqOJ8jZU90v6nHOlH4lX6+fK5ejK++aIk1j4i/6cRxnr5B/5n0oh6Ma5bKAoF06whOXR51oLVrzUrM2RWtG4uPjJTk5maG9wEXoV35Geuw5Jl3aNpOo8CYNfpzTpyvk0LDBjXJN586ek127dkmPHj2kSVjDr8mp07XXSVTUxf9oaRbRRDpdSedV4GI4WzYaNYzExMRIkyZNpLi42O247sfFxdV5jh73pryKiIgwW23h4eFmA9Awsa3C5b7Ezo3wSFdKYreLr31wVuNeIRUyfP
gg3t9AgPH0Pf1/63V7QCcj0oWrcnJy3FbZ1P3ExMQ6z9HjNcsr7cBaX3kAABBcvG6m0eaTcePGSd++fc3cIjq0t7y83DW6JjU1VTp27Gj6faiJEyfKwIEDZd68eTJixAhZuXKlbNu2TRYvXtz4rwYAAAR+GBk9erSUlJRIenq66YTaq1cvycrKcnVSLSgoMCNsnHR1Tp1b5KmnnpJp06aZSc90JA1zjAAAAMXaNACs0j4jGzduNLMy02cECCyefn971WcEAACgsRFGAACAVYQRAABgFWEEAABYRRgBAABWEUYAAIBVhBEAAGAVYQQAAFhFGAEAAP41HbwNzkliPV2KGIB/zcBaUVFh3t/MwAoEFuf39rdN9u4XYeTkyZPmNj6+cZYsBwAAvv0e12nh/Xptmurqajly5IhcccUVEhISYvtyADTyLyf9ofHVV1+x9hQQYDRiaBDp0KGD2yK6fhlGAAQuFsIEQAdWAABgFWEEAABYRRgBYFVERITMmDHD3AIITvQZAQAAVlEzAgAArCKMAAAAqwgjAADAKsIIAK8NGjRIHnvsscvmcQD4N8II4Gd+8pOfmJmIdWvatKl07dpVnn76aTl79qxcrnJzc831Hj9+3O34mjVr5JlnnmnU51qyZIkMGDBAWrdubbakpCTZunWrWxntt5+eni7t27eXqKgoU+aLL75w3X/o0CH52c9+Jp07dzb3X3vttWbEz5kzZ9we57PPPjPPFRkZaWaRnT17dqO+FiBYEEYAP3TnnXfK0aNHzRfo5MmT5de//rXMmTNH/E2bNm3MMg+NHXzGjBkj77//vuTl5ZmQkJycLIWFha4yGhp+85vfyKJFi+Qf//iHNGvWTFJSUuSbb74x9+/du9csQ/G73/1O/vWvf8mLL75oyk6bNs1t5lh93GuuuUa2b99u/v31/8PixYsb9fUAQUGH9gLwH+PGjXN8//vfdzs2dOhQR79+/Rz/+c9/HPfff7+jVatWjqioKMedd97p2L9/v6vcsmXLHC1btnSsXbvW0bVrV0dERIQjOTnZUVBQcMHHnzhxomPgwIGuff1bjzktX77c0adPH0fz5s0dsbGxjjFjxjiKi4vNffn5+Tp9gNumz1HX43h6/VlZWY5u3bo5mjVr5khJSXEcOXKk3n+vs2fPOq644grHa6+9Zvarq6sdcXFxjjlz5rjKHD9+3PxbrFixot7HmT17tqNz586u/YULFzpat27tqKysdB17/PHHHTfccEO9jwGgbtSMAAFAmxK0CUGbcLZt2ybvvPOOqRXQ5ojhw4dLVVWVq2xFRYU899xzsnz5cvnoo49M08mPfvSji3p+fXxtbvn0009l3bp1pplDr0VpzcRbb71l/t63b5+p0XnppZfqfBxPr3/u3Lnyxz/+UbZs2SIFBQUyZcqUeq9Ny+v5Wguj8vPzpaioyDTNOOnaOAkJCeY566Nr5zgfQ2nZ22+/3TSVOWntir7G//mf//HwXw6ACuOfAfBf+mWdk5MjmzZtkmHDhpkgoAGjf//+5v4///nPJgzo8R/84AfmmH4xv/LKK+bLV7322mty4403mn4Vt956a4Ou46c//anr7y5dupgmkO985zty6tQpad68uetLvF27dtKqVas6H0ObnDSEeHL92mSi/TjUo48+avrM1Ofxxx83K4Y6w4cGERUbG+tWTved99V24MABefnll00IctKy2qek9mM479P+KgA8Q80I4IfWr19vvuS146SGkNGjR5tahbCwMFfIUFdeeaXccMMNsmfPHtcxLaNBwalbt24mINQs4y3tMzFy5Ei5+uqrTR+QgQMHmuNaa+EpfX5Prj86OtoVRJR2Qj127Fidj/n888/LypUrZe3atebfqiG0r4n20dEwNGHChAY9BoALI4wAfmjw4MGyc+dOU5tw+vRpU7uho1UaQ2hoqKlxqalmM0lt5eXlpnmiRYsWpibjn//8p/nyV7VHnzSG8PBwt3193XWtaqG1GBpGsr
Oz5eabb3Ydj4uLM7fFxcVu5XXfeZ/TkSNHzL+11tTU7piqZet6jJrPAcAzhBHAD+noDx3SqzURWpugtKlFh/fq6BCnr7/+2vRh6N69u+uYltF+GU56v/Yb0fNV27ZtTb+OmjT41EdHnujz6Be/DnPVmpbaNRXOfhXnzp2r93E8vX5P6GgZ7cOSlZUlffv2dbtPm1Y0LGjzVs2RMfq8iYmJbjUiOg9Knz59ZNmyZSak1aRltc9KzaC2efNmU5NDEw3gHcIIECCuu+46+f73v2+aEj788EPTmfTHP/6xdOzY0RyvWbPwi1/8wnz5avOKNu/069fP1V/kjjvuMGFFO7hqzYvOr7F79+56n1cDkYYN7VNx8OBB0++j9twhOvxVazC0eamkpMT0JWno9X+bF154QaZPny5Lly6VTp06mf4bujmfU69DJ1p79tlnzbXu2rVLUlNTTb+SUaNGuQURfW1aw6LX7Hwcp7Fjx5rXrfOR6PDfVatWmY65aWlpHl8rgP9VzygbAJepuobe1h4aq8NfdWisDnuta2jsW2+95ejSpYsZzpqUlOQ4fPiw2+Okp6ebIbpadtKkSY5HH330gkN7X3/9dUenTp3M4yUmJjreeecdM4T3k08+cZV5+umnzZDakJCQbx3a+23XX5MOU675UXbNNdecN5RYtxkzZrjK6PDe6dOnm9eo1zxkyBDHvn373J6nrseo/ZH56aefOm677TbzGB07dnQ8//zzF/g/B6A+IfofZzABENheffVVUytQeyZUALCJZhoAAGAVYQQAAFhFMw0AALCKmhEAAGAVYQQAAFhFGAEAAFYRRgAAgFWEEQAAYBVhBAAAWEUYAQAAVhFGAACAVYQRAAAgNv1/A3dqPyX7ds8AAAAASUVORK5CYII=",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"countries.boxplot(column=\"Population2020\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Отсечение значений признака Население, превышающих 50000000 (такие значения заменяются на 50000000)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>PopulationClip</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Brazil</td>\n",
|
||
" <td>212559417</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>Nigeria</td>\n",
|
||
" <td>206139589</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Bangladesh</td>\n",
|
||
" <td>164689383</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Russia</td>\n",
|
||
" <td>145934462</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>Mexico</td>\n",
|
||
" <td>128932753</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>Japan</td>\n",
|
||
" <td>126476461</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>Ethiopia</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>Philippines</td>\n",
|
||
" <td>109581078</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>Egypt</td>\n",
|
||
" <td>102334404</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>Vietnam</td>\n",
|
||
" <td>97338579</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>DR Congo</td>\n",
|
||
" <td>89561403</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>Turkey</td>\n",
|
||
" <td>84339067</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>Iran</td>\n",
|
||
" <td>83992949</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>Germany</td>\n",
|
||
" <td>83783942</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>Thailand</td>\n",
|
||
" <td>69799978</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>21</th>\n",
|
||
" <td>United Kingdom</td>\n",
|
||
" <td>67886011</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>22</th>\n",
|
||
" <td>France</td>\n",
|
||
" <td>65273511</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>23</th>\n",
|
||
" <td>Italy</td>\n",
|
||
" <td>60461826</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>24</th>\n",
|
||
" <td>Tanzania</td>\n",
|
||
" <td>59734218</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25</th>\n",
|
||
" <td>South Africa</td>\n",
|
||
" <td>59308690</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>26</th>\n",
|
||
" <td>Myanmar</td>\n",
|
||
" <td>54409800</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>27</th>\n",
|
||
" <td>Kenya</td>\n",
|
||
" <td>53771296</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>28</th>\n",
|
||
" <td>South Korea</td>\n",
|
||
" <td>51269185</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>29</th>\n",
|
||
" <td>Colombia</td>\n",
|
||
" <td>50882891</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 PopulationClip\n",
|
||
"no \n",
|
||
"1 China 1439323776 50000000\n",
|
||
"2 India 1380004385 50000000\n",
|
||
"3 United States 331002651 50000000\n",
|
||
"4 Indonesia 273523615 50000000\n",
|
||
"5 Pakistan 220892340 50000000\n",
|
||
"6 Brazil 212559417 50000000\n",
|
||
"7 Nigeria 206139589 50000000\n",
|
||
"8 Bangladesh 164689383 50000000\n",
|
||
"9 Russia 145934462 50000000\n",
|
||
"10 Mexico 128932753 50000000\n",
|
||
"11 Japan 126476461 50000000\n",
|
||
"12 Ethiopia 114963588 50000000\n",
|
||
"13 Philippines 109581078 50000000\n",
|
||
"14 Egypt 102334404 50000000\n",
|
||
"15 Vietnam 97338579 50000000\n",
|
||
"16 DR Congo 89561403 50000000\n",
|
||
"17 Turkey 84339067 50000000\n",
|
||
"18 Iran 83992949 50000000\n",
|
||
"19 Germany 83783942 50000000\n",
|
||
"20 Thailand 69799978 50000000\n",
|
||
"21 United Kingdom 67886011 50000000\n",
|
||
"22 France 65273511 50000000\n",
|
||
"23 Italy 60461826 50000000\n",
|
||
"24 Tanzania 59734218 50000000\n",
|
||
"25 South Africa 59308690 50000000\n",
|
||
"26 Myanmar 54409800 50000000\n",
|
||
"27 Kenya 53771296 50000000\n",
|
||
"28 South Korea 51269185 50000000\n",
|
||
"29 Colombia 50882891 50000000"
|
||
]
|
||
},
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"countries_norm = countries.copy()\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationClip\"] = countries_norm[\"Population2020\"].clip(0, 50000000)\n",
|
||
"\n",
|
||
"countries_norm[countries_norm[\"Population2020\"] > 50000000][\n",
|
||
" [\"Country\", \"Population2020\", \"PopulationClip\"]\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Винсоризация признака Население"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"111195830.99999991\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>PopulationWinsorized</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Brazil</td>\n",
|
||
" <td>212559417</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>Nigeria</td>\n",
|
||
" <td>206139589</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Bangladesh</td>\n",
|
||
" <td>164689383</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Russia</td>\n",
|
||
" <td>145934462</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>Mexico</td>\n",
|
||
" <td>128932753</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>Japan</td>\n",
|
||
" <td>126476461</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>Ethiopia</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>Philippines</td>\n",
|
||
" <td>109581078</td>\n",
|
||
" <td>109581078</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>Egypt</td>\n",
|
||
" <td>102334404</td>\n",
|
||
" <td>102334404</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>Vietnam</td>\n",
|
||
" <td>97338579</td>\n",
|
||
" <td>97338579</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>DR Congo</td>\n",
|
||
" <td>89561403</td>\n",
|
||
" <td>89561403</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>Turkey</td>\n",
|
||
" <td>84339067</td>\n",
|
||
" <td>84339067</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>Iran</td>\n",
|
||
" <td>83992949</td>\n",
|
||
" <td>83992949</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>Germany</td>\n",
|
||
" <td>83783942</td>\n",
|
||
" <td>83783942</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>Thailand</td>\n",
|
||
" <td>69799978</td>\n",
|
||
" <td>69799978</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>21</th>\n",
|
||
" <td>United Kingdom</td>\n",
|
||
" <td>67886011</td>\n",
|
||
" <td>67886011</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>22</th>\n",
|
||
" <td>France</td>\n",
|
||
" <td>65273511</td>\n",
|
||
" <td>65273511</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>23</th>\n",
|
||
" <td>Italy</td>\n",
|
||
" <td>60461826</td>\n",
|
||
" <td>60461826</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>24</th>\n",
|
||
" <td>Tanzania</td>\n",
|
||
" <td>59734218</td>\n",
|
||
" <td>59734218</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25</th>\n",
|
||
" <td>South Africa</td>\n",
|
||
" <td>59308690</td>\n",
|
||
" <td>59308690</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>26</th>\n",
|
||
" <td>Myanmar</td>\n",
|
||
" <td>54409800</td>\n",
|
||
" <td>54409800</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>27</th>\n",
|
||
" <td>Kenya</td>\n",
|
||
" <td>53771296</td>\n",
|
||
" <td>53771296</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>28</th>\n",
|
||
" <td>South Korea</td>\n",
|
||
" <td>51269185</td>\n",
|
||
" <td>51269185</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>29</th>\n",
|
||
" <td>Colombia</td>\n",
|
||
" <td>50882891</td>\n",
|
||
" <td>50882891</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 PopulationWinsorized\n",
|
||
"no \n",
|
||
"1 China 1439323776 114963588\n",
|
||
"2 India 1380004385 114963588\n",
|
||
"3 United States 331002651 114963588\n",
|
||
"4 Indonesia 273523615 114963588\n",
|
||
"5 Pakistan 220892340 114963588\n",
|
||
"6 Brazil 212559417 114963588\n",
|
||
"7 Nigeria 206139589 114963588\n",
|
||
"8 Bangladesh 164689383 114963588\n",
|
||
"9 Russia 145934462 114963588\n",
|
||
"10 Mexico 128932753 114963588\n",
|
||
"11 Japan 126476461 114963588\n",
|
||
"12 Ethiopia 114963588 114963588\n",
|
||
"13 Philippines 109581078 109581078\n",
|
||
"14 Egypt 102334404 102334404\n",
|
||
"15 Vietnam 97338579 97338579\n",
|
||
"16 DR Congo 89561403 89561403\n",
|
||
"17 Turkey 84339067 84339067\n",
|
||
"18 Iran 83992949 83992949\n",
|
||
"19 Germany 83783942 83783942\n",
|
||
"20 Thailand 69799978 69799978\n",
|
||
"21 United Kingdom 67886011 67886011\n",
|
||
"22 France 65273511 65273511\n",
|
||
"23 Italy 60461826 60461826\n",
|
||
"24 Tanzania 59734218 59734218\n",
|
||
"25 South Africa 59308690 59308690\n",
|
||
"26 Myanmar 54409800 54409800\n",
|
||
"27 Kenya 53771296 53771296\n",
|
||
"28 South Korea 51269185 51269185\n",
|
||
"29 Colombia 50882891 50882891"
|
||
]
|
||
},
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from scipy.stats.mstats import winsorize\n",
|
||
"\n",
|
||
"print(countries_norm[\"Population2020\"].quantile(q=0.95))\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationWinsorized\"] = winsorize(\n",
|
||
" countries_norm[\"Population2020\"].fillna(countries_norm[\"Population2020\"].mean()),\n",
|
||
" (0, 0.05),\n",
|
||
" inplace=False,\n",
|
||
")\n",
|
||
"\n",
|
||
"countries_norm[countries_norm[\"Population2020\"] > 50000000][\n",
|
||
" [\"Country\", \"Population2020\", \"PopulationWinsorized\"]\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Нормализация значений"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>PopulationNorm</th>\n",
|
||
" <th>PopulationClipNorm</th>\n",
|
||
" <th>PopulationWinsorizedNorm</th>\n",
|
||
" <th>PopulationWinsorizedNorm2</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>1.000000e+00</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>9.587866e-01</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>2.299705e-01</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>1.900357e-01</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>1.534691e-01</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>231</th>\n",
|
||
" <td>Montserrat</td>\n",
|
||
" <td>4992</td>\n",
|
||
" <td>2.911786e-06</td>\n",
|
||
" <td>0.000084</td>\n",
|
||
" <td>0.000036</td>\n",
|
||
" <td>-0.999927</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>232</th>\n",
|
||
" <td>Falkland Islands</td>\n",
|
||
" <td>3480</td>\n",
|
||
" <td>1.861292e-06</td>\n",
|
||
" <td>0.000054</td>\n",
|
||
" <td>0.000023</td>\n",
|
||
" <td>-0.999953</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>233</th>\n",
|
||
" <td>Niue</td>\n",
|
||
" <td>1626</td>\n",
|
||
" <td>5.731862e-07</td>\n",
|
||
" <td>0.000017</td>\n",
|
||
" <td>0.000007</td>\n",
|
||
" <td>-0.999986</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>234</th>\n",
|
||
" <td>Tokelau</td>\n",
|
||
" <td>1357</td>\n",
|
||
" <td>3.862927e-07</td>\n",
|
||
" <td>0.000011</td>\n",
|
||
" <td>0.000005</td>\n",
|
||
" <td>-0.999990</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>235</th>\n",
|
||
" <td>Holy See</td>\n",
|
||
" <td>801</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>-1.000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>235 rows × 6 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 PopulationNorm PopulationClipNorm \\\n",
|
||
"no \n",
|
||
"1 China 1439323776 1.000000e+00 1.000000 \n",
|
||
"2 India 1380004385 9.587866e-01 1.000000 \n",
|
||
"3 United States 331002651 2.299705e-01 1.000000 \n",
|
||
"4 Indonesia 273523615 1.900357e-01 1.000000 \n",
|
||
"5 Pakistan 220892340 1.534691e-01 1.000000 \n",
|
||
".. ... ... ... ... \n",
|
||
"231 Montserrat 4992 2.911786e-06 0.000084 \n",
|
||
"232 Falkland Islands 3480 1.861292e-06 0.000054 \n",
|
||
"233 Niue 1626 5.731862e-07 0.000017 \n",
|
||
"234 Tokelau 1357 3.862927e-07 0.000011 \n",
|
||
"235 Holy See 801 0.000000e+00 0.000000 \n",
|
||
"\n",
|
||
" PopulationWinsorizedNorm PopulationWinsorizedNorm2 \n",
|
||
"no \n",
|
||
"1 1.000000 1.000000 \n",
|
||
"2 1.000000 1.000000 \n",
|
||
"3 1.000000 1.000000 \n",
|
||
"4 1.000000 1.000000 \n",
|
||
"5 1.000000 1.000000 \n",
|
||
".. ... ... \n",
|
||
"231 0.000036 -0.999927 \n",
|
||
"232 0.000023 -0.999953 \n",
|
||
"233 0.000007 -0.999986 \n",
|
||
"234 0.000005 -0.999990 \n",
|
||
"235 0.000000 -1.000000 \n",
|
||
"\n",
|
||
"[235 rows x 6 columns]"
|
||
]
|
||
},
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn import preprocessing\n",
|
||
"\n",
|
||
"min_max_scaler = preprocessing.MinMaxScaler()\n",
|
||
"\n",
|
||
"min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" countries_norm[\"Population2020\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationClipNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" countries_norm[\"PopulationClip\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationWinsorizedNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationWinsorizedNorm2\"] = min_max_scaler_2.fit_transform(\n",
|
||
" countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\n",
|
||
" [\n",
|
||
" \"Country\",\n",
|
||
" \"Population2020\",\n",
|
||
" \"PopulationNorm\",\n",
|
||
" \"PopulationClipNorm\",\n",
|
||
" \"PopulationWinsorizedNorm\",\n",
|
||
" \"PopulationWinsorizedNorm2\",\n",
|
||
" ]\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Стандартизация значений"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>PopulationStand</th>\n",
|
||
" <th>PopulationClipStand</th>\n",
|
||
" <th>PopulationWinsorizedStand</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>10.427597</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>9.987702</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>2.208627</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>1.782380</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>1.392082</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>231</th>\n",
|
||
" <td>Montserrat</td>\n",
|
||
" <td>4992</td>\n",
|
||
" <td>-0.245950</td>\n",
|
||
" <td>-0.795071</td>\n",
|
||
" <td>-0.621969</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>232</th>\n",
|
||
" <td>Falkland Islands</td>\n",
|
||
" <td>3480</td>\n",
|
||
" <td>-0.245962</td>\n",
|
||
" <td>-0.795158</td>\n",
|
||
" <td>-0.622019</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>233</th>\n",
|
||
" <td>Niue</td>\n",
|
||
" <td>1626</td>\n",
|
||
" <td>-0.245975</td>\n",
|
||
" <td>-0.795265</td>\n",
|
||
" <td>-0.622080</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>234</th>\n",
|
||
" <td>Tokelau</td>\n",
|
||
" <td>1357</td>\n",
|
||
" <td>-0.245977</td>\n",
|
||
" <td>-0.795280</td>\n",
|
||
" <td>-0.622089</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>235</th>\n",
|
||
" <td>Holy See</td>\n",
|
||
" <td>801</td>\n",
|
||
" <td>-0.245982</td>\n",
|
||
" <td>-0.795312</td>\n",
|
||
" <td>-0.622107</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>235 rows × 5 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 PopulationStand PopulationClipStand \\\n",
|
||
"no \n",
|
||
"1 China 1439323776 10.427597 2.073933 \n",
|
||
"2 India 1380004385 9.987702 2.073933 \n",
|
||
"3 United States 331002651 2.208627 2.073933 \n",
|
||
"4 Indonesia 273523615 1.782380 2.073933 \n",
|
||
"5 Pakistan 220892340 1.392082 2.073933 \n",
|
||
".. ... ... ... ... \n",
|
||
"231 Montserrat 4992 -0.245950 -0.795071 \n",
|
||
"232 Falkland Islands 3480 -0.245962 -0.795158 \n",
|
||
"233 Niue 1626 -0.245975 -0.795265 \n",
|
||
"234 Tokelau 1357 -0.245977 -0.795280 \n",
|
||
"235 Holy See 801 -0.245982 -0.795312 \n",
|
||
"\n",
|
||
" PopulationWinsorizedStand \n",
|
||
"no \n",
|
||
"1 3.171659 \n",
|
||
"2 3.171659 \n",
|
||
"3 3.171659 \n",
|
||
"4 3.171659 \n",
|
||
"5 3.171659 \n",
|
||
".. ... \n",
|
||
"231 -0.621969 \n",
|
||
"232 -0.622019 \n",
|
||
"233 -0.622080 \n",
|
||
"234 -0.622089 \n",
|
||
"235 -0.622107 \n",
|
||
"\n",
|
||
"[235 rows x 5 columns]"
|
||
]
|
||
},
|
||
"execution_count": 26,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
"from sklearn import preprocessing\n",
"\n",
"# One scaler instance is reused and refit for each source column below.\n",
"standard_scaler = preprocessing.StandardScaler()\n",
"\n",
"# New standardised column -> source column it is computed from.\n",
"standardized_columns = {\n",
"    \"PopulationStand\": \"Population2020\",\n",
"    \"PopulationClipStand\": \"PopulationClip\",\n",
"    \"PopulationWinsorizedStand\": \"PopulationWinsorized\",\n",
"}\n",
"\n",
"for target_column, source_column in standardized_columns.items():\n",
"    # The scaler expects a 2-D (n_samples, n_features) array, hence the reshape;\n",
"    # the result is flattened back to 1-D before it is stored as a column.\n",
"    column_2d = countries_norm[source_column].to_numpy().reshape(-1, 1)\n",
"    countries_norm[target_column] = standard_scaler.fit_transform(column_2d).ravel()\n",
"\n",
"# Show the original population next to every standardised variant.\n",
"countries_norm[[\"Country\", \"Population2020\", *standardized_columns]]"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.13.2"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|