3468 lines
123 KiB
Plaintext
3468 lines
123 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Унитарное кодирование\n",
|
||
"\n",
|
||
"Преобразование категориального признака в несколько бинарных признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Загрузка набора данных World Population"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>Yearly</th>\n",
|
||
" <th>NetChange</th>\n",
|
||
" <th>Density</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>Migrants</th>\n",
|
||
" <th>FertRate</th>\n",
|
||
" <th>MedAge</th>\n",
|
||
" <th>UrbanPop</th>\n",
|
||
" <th>WorldShare</th>\n",
|
||
" <th>Net Change</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>0.39</td>\n",
|
||
" <td>5,540,090</td>\n",
|
||
" <td>153</td>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>-348,399</td>\n",
|
||
" <td>1.7</td>\n",
|
||
" <td>38</td>\n",
|
||
" <td>61%</td>\n",
|
||
" <td>18.47%</td>\n",
|
||
" <td>5540090</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>0.99</td>\n",
|
||
" <td>13,586,631</td>\n",
|
||
" <td>464</td>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>-532,687</td>\n",
|
||
" <td>2.2</td>\n",
|
||
" <td>28</td>\n",
|
||
" <td>35%</td>\n",
|
||
" <td>17.70%</td>\n",
|
||
" <td>13586631</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>0.59</td>\n",
|
||
" <td>1,937,734</td>\n",
|
||
" <td>36</td>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>954,806</td>\n",
|
||
" <td>1.8</td>\n",
|
||
" <td>38</td>\n",
|
||
" <td>83%</td>\n",
|
||
" <td>4.25%</td>\n",
|
||
" <td>1937734</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>1.07</td>\n",
|
||
" <td>2,898,047</td>\n",
|
||
" <td>151</td>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>-98,955</td>\n",
|
||
" <td>2.3</td>\n",
|
||
" <td>30</td>\n",
|
||
" <td>56%</td>\n",
|
||
" <td>3.51%</td>\n",
|
||
" <td>2898047</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>2.00</td>\n",
|
||
" <td>4,327,022</td>\n",
|
||
" <td>287</td>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>-233,379</td>\n",
|
||
" <td>3.6</td>\n",
|
||
" <td>23</td>\n",
|
||
" <td>35%</td>\n",
|
||
" <td>2.83%</td>\n",
|
||
" <td>4327022</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>231</th>\n",
|
||
" <td>Montserrat</td>\n",
|
||
" <td>4992</td>\n",
|
||
" <td>0.06</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>50</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>10%</td>\n",
|
||
" <td>0.00%</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>232</th>\n",
|
||
" <td>Falkland Islands</td>\n",
|
||
" <td>3480</td>\n",
|
||
" <td>3.05</td>\n",
|
||
" <td>103</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>12170</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>66%</td>\n",
|
||
" <td>0.00%</td>\n",
|
||
" <td>103</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>233</th>\n",
|
||
" <td>Niue</td>\n",
|
||
" <td>1626</td>\n",
|
||
" <td>0.68</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>260</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>46%</td>\n",
|
||
" <td>0.00%</td>\n",
|
||
" <td>11</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>234</th>\n",
|
||
" <td>Tokelau</td>\n",
|
||
" <td>1357</td>\n",
|
||
" <td>1.27</td>\n",
|
||
" <td>17</td>\n",
|
||
" <td>136</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>0%</td>\n",
|
||
" <td>0.00%</td>\n",
|
||
" <td>17</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>235</th>\n",
|
||
" <td>Holy See</td>\n",
|
||
" <td>801</td>\n",
|
||
" <td>0.25</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2,003</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>0.00%</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>235 rows × 12 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 Yearly NetChange Density LandArea \\\n",
|
||
"no \n",
|
||
"1 China 1439323776 0.39 5,540,090 153 9388211 \n",
|
||
"2 India 1380004385 0.99 13,586,631 464 2973190 \n",
|
||
"3 United States 331002651 0.59 1,937,734 36 9147420 \n",
|
||
"4 Indonesia 273523615 1.07 2,898,047 151 1811570 \n",
|
||
"5 Pakistan 220892340 2.00 4,327,022 287 770880 \n",
|
||
".. ... ... ... ... ... ... \n",
|
||
"231 Montserrat 4992 0.06 3 50 100 \n",
|
||
"232 Falkland Islands 3480 3.05 103 0 12170 \n",
|
||
"233 Niue 1626 0.68 11 6 260 \n",
|
||
"234 Tokelau 1357 1.27 17 136 10 \n",
|
||
"235 Holy See 801 0.25 2 2,003 0 \n",
|
||
"\n",
|
||
" Migrants FertRate MedAge UrbanPop WorldShare Net Change \n",
|
||
"no \n",
|
||
"1 -348,399 1.7 38 61% 18.47% 5540090 \n",
|
||
"2 -532,687 2.2 28 35% 17.70% 13586631 \n",
|
||
"3 954,806 1.8 38 83% 4.25% 1937734 \n",
|
||
"4 -98,955 2.3 30 56% 3.51% 2898047 \n",
|
||
"5 -233,379 3.6 23 35% 2.83% 4327022 \n",
|
||
".. ... ... ... ... ... ... \n",
|
||
"231 NaN N.A. N.A. 10% 0.00% 3 \n",
|
||
"232 NaN N.A. N.A. 66% 0.00% 103 \n",
|
||
"233 NaN N.A. N.A. 46% 0.00% 11 \n",
|
||
"234 NaN N.A. N.A. 0% 0.00% 17 \n",
|
||
"235 NaN N.A. N.A. N.A. 0.00% 2 \n",
|
||
"\n",
|
||
"[235 rows x 12 columns]"
|
||
]
|
||
},
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"countries = pd.read_csv(\n",
|
||
" \"data/world-population-by-country-2020.csv\", index_col=\"no\"\n",
|
||
")\n",
|
||
"\n",
|
||
"countries[\"Population2020\"] = countries[\"Population2020\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"countries[\"Net Change\"] = countries[\"NetChange\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"countries[\"Yearly\"] = countries[\"Yearly\"].apply(\n",
|
||
" lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
|
||
")\n",
|
||
"countries[\"LandArea\"] = countries[\"LandArea\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"countries"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
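||
"#### Унитарное кодирование признаков Пол (Sex) и Порт посадки (Embarked) на примере набора данных Titanic (код ниже закомментирован, так как в наборе World Population этих признаков нет)"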
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Кодирование"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"import numpy as np\n",
|
||
"\n",
|
||
"# encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
|
||
"\n",
|
||
"# encoded_values = encoder.fit_transform(titanic[[\"Embarked\", \"Sex\"]])\n",
|
||
"\n",
|
||
"# encoded_columns = encoder.get_feature_names_out([\"Embarked\", \"Sex\"])\n",
|
||
"\n",
|
||
"# encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n",
|
||
"\n",
|
||
"# encoded_values_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Добавление признаков в исходный Dataframe"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# titanic = pd.concat([titanic, encoded_values_df], axis=1)\n",
|
||
"\n",
|
||
"# titanic"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Дискретизация признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Равномерное разделение данных на 3 группы"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"labels = [\"Small\", \"Middle\", \"Big\"]\n",
|
||
"num_bins = 3"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(array([ 0. , 5458956.66666667, 10917913.33333333,\n",
|
||
" 16376870. ]),\n",
|
||
" array([229, 5, 1]))"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"hist1, bins1 = np.histogram(\n",
|
||
" countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins=num_bins\n",
|
||
")\n",
|
||
"bins1, hist1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>(5458956.667, 10917913.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>(5458956.667, 10917913.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>(5458956.667, 10917913.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>(10917913.333, 16376870.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 (5458956.667, 10917913.333]\n",
|
||
"2 2973190 (0.0, 5458956.667]\n",
|
||
"3 9147420 (5458956.667, 10917913.333]\n",
|
||
"4 1811570 (0.0, 5458956.667]\n",
|
||
"5 770880 (0.0, 5458956.667]\n",
|
||
"6 8358140 (5458956.667, 10917913.333]\n",
|
||
"7 910770 (0.0, 5458956.667]\n",
|
||
"8 130170 (0.0, 5458956.667]\n",
|
||
"9 16376870 (10917913.333, 16376870.0]\n",
|
||
"10 1943950 (0.0, 5458956.667]\n",
|
||
"11 364555 (0.0, 5458956.667]\n",
|
||
"12 1000000 (0.0, 5458956.667]\n",
|
||
"13 298170 (0.0, 5458956.667]\n",
|
||
"14 995450 (0.0, 5458956.667]\n",
|
||
"15 310070 (0.0, 5458956.667]\n",
|
||
"16 2267050 (0.0, 5458956.667]\n",
|
||
"17 769630 (0.0, 5458956.667]\n",
|
||
"18 1628550 (0.0, 5458956.667]\n",
|
||
"19 348560 (0.0, 5458956.667]\n",
|
||
"20 510890 (0.0, 5458956.667]"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins1))], axis=1\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 Middle\n",
|
||
"2 2973190 Small\n",
|
||
"3 9147420 Middle\n",
|
||
"4 1811570 Small\n",
|
||
"5 770880 Small\n",
|
||
"6 8358140 Middle\n",
|
||
"7 910770 Small\n",
|
||
"8 130170 Small\n",
|
||
"9 16376870 Big\n",
|
||
"10 1943950 Small\n",
|
||
"11 364555 Small\n",
|
||
"12 1000000 Small\n",
|
||
"13 298170 Small\n",
|
||
"14 995450 Small\n",
|
||
"15 310070 Small\n",
|
||
"16 2267050 Small\n",
|
||
"17 769630 Small\n",
|
||
"18 1628550 Small\n",
|
||
"19 348560 Small\n",
|
||
"20 510890 Small"
|
||
]
|
||
},
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins1), labels=labels)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Равномерное разделение данных на 3 группы с установкой собственной границы диапазона значений (от 0 до 12 000 000); значения за пределами диапазона получают NaN"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(array([ 0., 4000000., 8000000., 12000000.]),\n",
|
||
" array([229, 1, 4, 1]))"
|
||
]
|
||
},
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"labels = [\"Small\", \"Middle\", \"Big\"]\n",
|
||
"bins2 = np.linspace(0, 12000000, 4)\n",
|
||
"\n",
|
||
"tmp_bins2 = np.digitize(\n",
|
||
" countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins2\n",
|
||
")\n",
|
||
"\n",
|
||
"hist2 = np.bincount(tmp_bins2 - 1)\n",
|
||
"\n",
|
||
"bins2, hist2"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>(8000000.0, 12000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>(8000000.0, 12000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>(8000000.0, 12000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 (8000000.0, 12000000.0]\n",
|
||
"2 2973190 (0.0, 4000000.0]\n",
|
||
"3 9147420 (8000000.0, 12000000.0]\n",
|
||
"4 1811570 (0.0, 4000000.0]\n",
|
||
"5 770880 (0.0, 4000000.0]\n",
|
||
"6 8358140 (8000000.0, 12000000.0]\n",
|
||
"7 910770 (0.0, 4000000.0]\n",
|
||
"8 130170 (0.0, 4000000.0]\n",
|
||
"9 16376870 NaN\n",
|
||
"10 1943950 (0.0, 4000000.0]\n",
|
||
"11 364555 (0.0, 4000000.0]\n",
|
||
"12 1000000 (0.0, 4000000.0]\n",
|
||
"13 298170 (0.0, 4000000.0]\n",
|
||
"14 995450 (0.0, 4000000.0]\n",
|
||
"15 310070 (0.0, 4000000.0]\n",
|
||
"16 2267050 (0.0, 4000000.0]\n",
|
||
"17 769630 (0.0, 4000000.0]\n",
|
||
"18 1628550 (0.0, 4000000.0]\n",
|
||
"19 348560 (0.0, 4000000.0]\n",
|
||
"20 510890 (0.0, 4000000.0]"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins2))], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 Big\n",
|
||
"2 2973190 Small\n",
|
||
"3 9147420 Big\n",
|
||
"4 1811570 Small\n",
|
||
"5 770880 Small\n",
|
||
"6 8358140 Big\n",
|
||
"7 910770 Small\n",
|
||
"8 130170 Small\n",
|
||
"9 16376870 NaN\n",
|
||
"10 1943950 Small\n",
|
||
"11 364555 Small\n",
|
||
"12 1000000 Small\n",
|
||
"13 298170 Small\n",
|
||
"14 995450 Small\n",
|
||
"15 310070 Small\n",
|
||
"16 2267050 Small\n",
|
||
"17 769630 Small\n",
|
||
"18 1628550 Small\n",
|
||
"19 348560 Small\n",
|
||
"20 510890 Small"
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins2), labels=labels)],\n",
|
||
" axis=1,\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Разделение данных на 5 групп с установкой собственных интервалов (0 – 1 000, 1 000 – 100 000, 100 000 – 500 000, 500 000 – 3 000 000, более 3 000 000)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(array([0.e+00, 1.e+03, 1.e+05, 5.e+05, 3.e+06, inf]),\n",
|
||
" array([52, 77, 56, 44, 6]))"
|
||
]
|
||
},
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"labels2 = [\"Dwarf\", \"Small\", \"Middle\", \"Big\", \"Giant\"]\n",
|
||
"hist3, bins3 = np.histogram(\n",
|
||
"\n",
|
||
" countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins=[0, 1000, 100000, 500000, 3000000, np.inf]\n",
|
||
")\n",
|
||
"\n",
|
||
"\n",
|
||
"bins3, hist3"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>(3000000.0, inf]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>(3000000.0, inf]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>(3000000.0, inf]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>(100000.0, 500000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>(3000000.0, inf]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>(100000.0, 500000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>(100000.0, 500000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>(100000.0, 500000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>(100000.0, 500000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 (3000000.0, inf]\n",
|
||
"2 2973190 (500000.0, 3000000.0]\n",
|
||
"3 9147420 (3000000.0, inf]\n",
|
||
"4 1811570 (500000.0, 3000000.0]\n",
|
||
"5 770880 (500000.0, 3000000.0]\n",
|
||
"6 8358140 (3000000.0, inf]\n",
|
||
"7 910770 (500000.0, 3000000.0]\n",
|
||
"8 130170 (100000.0, 500000.0]\n",
|
||
"9 16376870 (3000000.0, inf]\n",
|
||
"10 1943950 (500000.0, 3000000.0]\n",
|
||
"11 364555 (100000.0, 500000.0]\n",
|
||
"12 1000000 (500000.0, 3000000.0]\n",
|
||
"13 298170 (100000.0, 500000.0]\n",
|
||
"14 995450 (500000.0, 3000000.0]\n",
|
||
"15 310070 (100000.0, 500000.0]\n",
|
||
"16 2267050 (500000.0, 3000000.0]\n",
|
||
"17 769630 (500000.0, 3000000.0]\n",
|
||
"18 1628550 (500000.0, 3000000.0]\n",
|
||
"19 348560 (100000.0, 500000.0]\n",
|
||
"20 510890 (500000.0, 3000000.0]"
|
||
]
|
||
},
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins3))], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 Giant\n",
|
||
"2 2973190 Big\n",
|
||
"3 9147420 Giant\n",
|
||
"4 1811570 Big\n",
|
||
"5 770880 Big\n",
|
||
"6 8358140 Giant\n",
|
||
"7 910770 Big\n",
|
||
"8 130170 Middle\n",
|
||
"9 16376870 Giant\n",
|
||
"10 1943950 Big\n",
|
||
"11 364555 Middle\n",
|
||
"12 1000000 Big\n",
|
||
"13 298170 Middle\n",
|
||
"14 995450 Big\n",
|
||
"15 310070 Middle\n",
|
||
"16 2267050 Big\n",
|
||
"17 769630 Big\n",
|
||
"18 1628550 Big\n",
|
||
"19 348560 Middle\n",
|
||
"20 510890 Big"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins3), labels=labels2)],\n",
|
||
" axis=1,\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Квантильное разделение данных на 5 групп\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 4\n",
|
||
"2 2973190 4\n",
|
||
"3 9147420 4\n",
|
||
"4 1811570 4\n",
|
||
"5 770880 4\n",
|
||
"6 8358140 4\n",
|
||
"7 910770 4\n",
|
||
"8 130170 2\n",
|
||
"9 16376870 4\n",
|
||
"10 1943950 4\n",
|
||
"11 364555 3\n",
|
||
"12 1000000 4\n",
|
||
"13 298170 3\n",
|
||
"14 995450 4\n",
|
||
"15 310070 3\n",
|
||
"16 2267050 4\n",
|
||
"17 769630 4\n",
|
||
"18 1628550 4\n",
|
||
"19 348560 3\n",
|
||
"20 510890 3"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([countries[\"LandArea\"], pd.qcut(countries[\"LandArea\"], q=5, labels=False)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 Giant\n",
|
||
"2 2973190 Giant\n",
|
||
"3 9147420 Giant\n",
|
||
"4 1811570 Giant\n",
|
||
"5 770880 Giant\n",
|
||
"6 8358140 Giant\n",
|
||
"7 910770 Giant\n",
|
||
"8 130170 Middle\n",
|
||
"9 16376870 Giant\n",
|
||
"10 1943950 Giant\n",
|
||
"11 364555 Big\n",
|
||
"12 1000000 Giant\n",
|
||
"13 298170 Big\n",
|
||
"14 995450 Giant\n",
|
||
"15 310070 Big\n",
|
||
"16 2267050 Giant\n",
|
||
"17 769630 Giant\n",
|
||
"18 1628550 Giant\n",
|
||
"19 348560 Big\n",
|
||
"20 510890 Big"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([countries[\"LandArea\"], pd.qcut(countries[\"LandArea\"], q=5, labels=labels2)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Пример конструирования признаков на основе существующих\n",
|
||
"\n",
|
||
"Title - обращение к пассажиру (Mr, Mrs, Miss)\n",
|
||
"\n",
|
||
"Is_married - замужняя ли женщина\n",
|
||
"\n",
|
||
"Cabin_type - палуба (тип каюты)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# titanic_cl = titanic.drop(\n",
|
||
"# [\"Embarked_Q\", \"Embarked_S\", \"Embarked_nan\", \"Sex_male\"], axis=1, errors=\"ignore\"\n",
|
||
"# )\n",
|
||
"# titanic_cl = titanic_cl.dropna()\n",
|
||
"\n",
|
||
"# titanic_cl[\"Title\"] = [\n",
|
||
"# i.split(\",\")[1].split(\".\")[0].strip() for i in titanic_cl[\"Name\"]\n",
|
||
"# ]\n",
|
||
"\n",
|
||
"# titanic_cl[\"Is_married\"] = [1 if i == \"Mrs\" else 0 for i in titanic_cl[\"Title\"]]\n",
|
||
"\n",
|
||
"# titanic_cl[\"Cabin_type\"] = [i[0] for i in titanic_cl[\"Cabin\"]]\n",
|
||
"\n",
|
||
"# titanic_cl"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Пример использования библиотеки Featuretools для автоматического конструирования (синтеза) признаков\n",
|
||
"\n",
|
||
"https://featuretools.alteryx.com/en/stable/getting_started/using_entitysets.html"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Загрузка данных"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1379: SyntaxWarning: invalid escape sequence '\\l'\n",
|
||
" columns_string = \"\\l\".join(column_typing_info) # noqa: W605\n",
|
||
"c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1381: SyntaxWarning: invalid escape sequence '\\l'\n",
|
||
" label = \"{%s (%d row%s)|%s\\l}\" % ( # noqa: W605\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"( no Country Population2020 Yearly NetChange Density \\\n",
|
||
" 0 1 China 1439323776 0.39 5540090 153 \n",
|
||
" 1 2 India 1380004385 0.99 13586631 464 \n",
|
||
" 2 3 United States 331002651 0.59 1937734 36 \n",
|
||
" 3 4 Indonesia 273523615 1.07 2898047 151 \n",
|
||
" 4 5 Pakistan 220892340 2.00 4327022 287 \n",
|
||
" .. ... ... ... ... ... ... \n",
|
||
" 230 231 Montserrat 4992 0.06 3 50 \n",
|
||
" 231 232 Falkland Islands 3480 3.05 103 0 \n",
|
||
" 232 233 Niue 1626 0.68 11 6 \n",
|
||
" 233 234 Tokelau 1357 1.27 17 136 \n",
|
||
" 234 235 Holy See 801 0.25 2 2,003 \n",
|
||
" \n",
|
||
" LandArea \n",
|
||
" 0 9388211 \n",
|
||
" 1 2973190 \n",
|
||
" 2 9147420 \n",
|
||
" 3 1811570 \n",
|
||
" 4 770880 \n",
|
||
" .. ... \n",
|
||
" 230 100 \n",
|
||
" 231 12170 \n",
|
||
" 232 260 \n",
|
||
" 233 10 \n",
|
||
" 234 0 \n",
|
||
" \n",
|
||
" [235 rows x 7 columns],\n",
|
||
" Year Population YearlyPer Yearly Median Fertility Density\n",
|
||
" 0 2020 7794798739 1.10 83000320 31 2.47 52\n",
|
||
" 1 2025 8184437460 0.98 77927744 32 2.54 55\n",
|
||
" 2 2030 8548487400 0.87 72809988 33 2.62 57\n",
|
||
" 3 2035 8887524213 0.78 67807363 34 2.70 60\n",
|
||
" 4 2040 9198847240 0.69 62264605 35 2.77 62\n",
|
||
" 5 2045 9481803274 0.61 56591207 35 2.85 64\n",
|
||
" 6 2050 9735033990 0.53 50646143 36 2.95 65,\n",
|
||
" Country Capital Continent\n",
|
||
" 0 Afghanistan Kabul Asia\n",
|
||
" 1 Albania Tirana Europe\n",
|
||
" 2 Algeria Algiers Africa\n",
|
||
" 3 American Samoa Pago Pago Oceania\n",
|
||
" 4 Andorra Andorra la Vella Europe\n",
|
||
" .. ... ... ...\n",
|
||
" 229 Wallis and Futuna Mata-Utu Oceania\n",
|
||
" 230 Western Sahara El Aai?�n Africa\n",
|
||
" 231 Yemen Sanaa Asia\n",
|
||
" 232 Zambia Lusaka Africa\n",
|
||
" 233 Zimbabwe Harare Africa\n",
|
||
" \n",
|
||
" [234 rows x 3 columns])"
|
||
]
|
||
},
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import featuretools as ft\n",
|
||
"from woodwork.logical_types import Categorical, Datetime\n",
|
||
"\n",
|
||
"info = pd.read_csv(\"data/world-population-by-country-2020.csv\")\n",
|
||
"forcast = pd.read_csv(\"data/world-population-forcast-2020-2050.csv\")\n",
|
||
"capitals = pd.read_csv(\"data/countries-continents-capitals.csv\", encoding=\"ISO-8859-1\")\n",
|
||
"forcast[\"Population\"] = forcast[\"Population\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"forcast[\"YearlyPer\"] = forcast[\"YearlyPer\"].apply(\n",
|
||
" lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
|
||
")\n",
|
||
"forcast[\"Yearly\"] = forcast[\"Yearly\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"info = info.drop([\"Migrants\", \"FertRate\", \"MedAge\", \"UrbanPop\", \"WorldShare\"], axis=1)\n",
|
||
"info[\"Population2020\"] = info[\"Population2020\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"info[\"Yearly\"] = info[\"Yearly\"].apply(\n",
|
||
" lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
|
||
")\n",
|
||
"info[\"NetChange\"] = info[\"NetChange\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"info[\"LandArea\"] = info[\"LandArea\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"\n",
|
||
"info, forcast, capitals"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Создание сущностей в featuretools\n",
|
||
"\n",
|
||
"Добавление dataframe'ов с данными в EntitySet с указанием параметров: название сущности (таблицы), первичный ключ, логические типы атрибутов (категориальные, даты)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Entityset: countries\n",
|
||
" DataFrames:\n",
|
||
" countries [Rows: 235, Columns: 7]\n",
|
||
" capitals [Rows: 234, Columns: 3]\n",
|
||
" forcast [Rows: 7, Columns: 8]\n",
|
||
" Relationships:\n",
|
||
" No relationships"
|
||
]
|
||
},
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"es = ft.EntitySet(id=\"countries\")\n",
|
||
"\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"countries\",\n",
|
||
" dataframe=info,\n",
|
||
" index=\"no\",\n",
|
||
" logical_types={\n",
|
||
" \"Country\": Categorical,\n",
|
||
" },\n",
|
||
")\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"capitals\",\n",
|
||
" dataframe=capitals,\n",
|
||
" index=\"Country\",\n",
|
||
" logical_types={\n",
|
||
" \"Country\": Categorical,\n",
|
||
" \"Capital\": Categorical,\n",
|
||
" \"Continent\": Categorical,\n",
|
||
" },\n",
|
||
")\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"forcast\",\n",
|
||
" dataframe=forcast,\n",
|
||
" index=\"forcast_id\",\n",
|
||
" make_index=True,\n",
|
||
" logical_types={\n",
|
||
" \"Year\": Datetime,\n",
|
||
" },\n",
|
||
")\n",
|
||
"\n",
|
||
"es"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Настройка связей между сущностями featuretools\n",
|
||
"\n",
|
||
"Настройка связей между таблицами на уровне ключей\n",
|
||
"\n",
|
||
"Связь указывается от родителя к потомкам (таблица-родитель, первичный ключ, таблица-потомок, внешний ключ)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Entityset: countries\n",
|
||
" DataFrames:\n",
|
||
" countries [Rows: 235, Columns: 7]\n",
|
||
" capitals [Rows: 234, Columns: 3]\n",
|
||
" forcast [Rows: 7, Columns: 8]\n",
|
||
" Relationships:\n",
|
||
" countries.Country -> capitals.Country"
|
||
]
|
||
},
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"es = es.add_relationship(\"capitals\", \"Country\", \"countries\", \"Country\")\n",
|
||
"\n",
|
||
"es"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Автоматическое конструирование признаков с помощью featuretools\n",
|
||
"\n",
|
||
"Библиотека применяет различные функции агрегации и трансформации к атрибутам таблицы countries с учетом отношений\n",
|
||
"\n",
|
||
"Результат помещается в DataFrame feature_matrix"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>Yearly</th>\n",
|
||
" <th>NetChange</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>capitals.Capital</th>\n",
|
||
" <th>capitals.Continent</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>0.39</td>\n",
|
||
" <td>5540090</td>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>Beijing</td>\n",
|
||
" <td>Asia</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>0.99</td>\n",
|
||
" <td>13586631</td>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>New Delhi</td>\n",
|
||
" <td>Asia</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>0.59</td>\n",
|
||
" <td>1937734</td>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>Washington, D.C.</td>\n",
|
||
" <td>North America</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>1.07</td>\n",
|
||
" <td>2898047</td>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>Jakarta</td>\n",
|
||
" <td>Asia</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>2.00</td>\n",
|
||
" <td>4327022</td>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>Islamabad</td>\n",
|
||
" <td>Asia</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>231</th>\n",
|
||
" <td>Montserrat</td>\n",
|
||
" <td>4992</td>\n",
|
||
" <td>0.06</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>Brades</td>\n",
|
||
" <td>North America</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>232</th>\n",
|
||
" <td>Falkland Islands</td>\n",
|
||
" <td>3480</td>\n",
|
||
" <td>3.05</td>\n",
|
||
" <td>103</td>\n",
|
||
" <td>12170</td>\n",
|
||
" <td>Stanley</td>\n",
|
||
" <td>South America</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>233</th>\n",
|
||
" <td>Niue</td>\n",
|
||
" <td>1626</td>\n",
|
||
" <td>0.68</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>260</td>\n",
|
||
" <td>Alofi</td>\n",
|
||
" <td>Oceania</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>234</th>\n",
|
||
" <td>Tokelau</td>\n",
|
||
" <td>1357</td>\n",
|
||
" <td>1.27</td>\n",
|
||
" <td>17</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>Nukunonu</td>\n",
|
||
" <td>Oceania</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>235</th>\n",
|
||
" <td>Holy See</td>\n",
|
||
" <td>801</td>\n",
|
||
" <td>0.25</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>235 rows × 7 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 Yearly NetChange LandArea \\\n",
|
||
"no \n",
|
||
"1 China 1439323776 0.39 5540090 9388211 \n",
|
||
"2 India 1380004385 0.99 13586631 2973190 \n",
|
||
"3 United States 331002651 0.59 1937734 9147420 \n",
|
||
"4 Indonesia 273523615 1.07 2898047 1811570 \n",
|
||
"5 Pakistan 220892340 2.00 4327022 770880 \n",
|
||
".. ... ... ... ... ... \n",
|
||
"231 Montserrat 4992 0.06 3 100 \n",
|
||
"232 Falkland Islands 3480 3.05 103 12170 \n",
|
||
"233 Niue 1626 0.68 11 260 \n",
|
||
"234 Tokelau 1357 1.27 17 10 \n",
|
||
"235 Holy See 801 0.25 2 0 \n",
|
||
"\n",
|
||
" capitals.Capital capitals.Continent \n",
|
||
"no \n",
|
||
"1 Beijing Asia \n",
|
||
"2 New Delhi Asia \n",
|
||
"3 Washington, D.C. North America \n",
|
||
"4 Jakarta Asia \n",
|
||
"5 Islamabad Asia \n",
|
||
".. ... ... \n",
|
||
"231 Brades North America \n",
|
||
"232 Stanley South America \n",
|
||
"233 Alofi Oceania \n",
|
||
"234 Nukunonu Oceania \n",
|
||
"235 NaN NaN \n",
|
||
"\n",
|
||
"[235 rows x 7 columns]"
|
||
]
|
||
},
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"feature_matrix, feature_defs = ft.dfs(\n",
|
||
" entityset=es,\n",
|
||
" target_dataframe_name=\"countries\",\n",
|
||
" max_depth=1,\n",
|
||
")\n",
|
||
"\n",
|
||
"feature_matrix"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Полученные признаки\n",
|
||
"\n",
|
||
"Список колонок полученного dataframe'а"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"[<Feature: Country>,\n",
|
||
" <Feature: Population2020>,\n",
|
||
" <Feature: Yearly>,\n",
|
||
" <Feature: NetChange>,\n",
|
||
" <Feature: LandArea>,\n",
|
||
" <Feature: capitals.Capital>,\n",
|
||
" <Feature: capitals.Continent>]"
|
||
]
|
||
},
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"feature_defs"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Отсечение значений признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Определение выбросов с помощью boxplot"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Axes: >"
|
||
]
|
||
},
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAiMAAAGsCAYAAAAPJKchAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAvfklEQVR4nO3df1hVZb7//9dmAxsoQU0FNQyL0rokZXQknEhMfqTFxHhVTnbUPOX51tS5LPKcwknNLDFF08oOx2a0PFNqmZLHzORQKiXlEaN0Jk1TYzJBraNbwWC7Wd8//LBHApStuG/ZPB/X5XWx7nWvtd575lp7v1rrXuu2WZZlCQAAwJAA0wUAAIC2jTACAACMIowAAACjCCMAAMAowggAADCKMAIAAIwijAAAAKMIIwAAwCjCCAAAMIowAgAAjGpVYWTTpk3KyMhQt27dZLPZlJ+f7/U+3n77bfXr109hYWG66qqrNHv27JYvFAAANFurCiOVlZXq27evFixYcF7bf/DBB7rvvvv00EMPaceOHXr11Vf14osv6pVXXmnhSgEAQHPZWutEeTabTatWrVJmZqanrbq6Wn/84x+1dOlSHT16VH369NELL7yg5ORkSdKoUaPkcrn0zjvveLZ5+eWXNWvWLJWVlclms/n4UwAAgFZ1ZeRcHn30URUXF2vZsmX66quvdPfdd+u2227T7t27JZ0OKyEhIfW2CQ0N1ffff6/vvvvORMkAALR5fhNGysrKtHjxYr3zzjtKSkrSNddco4kTJ+rmm2/W4sWLJUnp6elauXKlCgsLVVtbq2+++UZz5syRJB08eNBk+QAAtFmBpgtoKdu3b5fb7dZ1111Xr726ulpXXHGFJGn8+PH69ttvdccdd8jlcik8PFwTJkzQM888o4AAv8llAAC0Kn4TRk6cOCG73a6SkhLZ7fZ66y6//HJJp8eZvPDCC5oxY4bKy8vVuXNnFRYWSpKuvvpqn9cMAAD8KIzEx8fL7Xbr0KFDSkpKOmtfu92u7t27S5KWLl2qxMREde7c2RdlAgCAX2hVYeTEiRPas2ePZ3nfvn0qLS1Vx44ddd111+m+++7TmDFjNGfOHMXHx+vw4cMqLCzUjTfeqNtvv11HjhzRihUrlJycrJ9//tkzxmTjxo0GPxUAAG1bq3q0d8OGDRoyZEiD9rFjx+r111+Xy+XSc889pyVLlujAgQPq1KmTbrrpJk2bNk1xcXE6cuSIMjIytH37dlmWpcTERD3//PNKSEgw8GkAAIDUysIIAADwPzxCAgAAjCKMAAAAo1rFANba2lr98MMPateuHa9sBwCglbAsS8ePH1e3bt3O+j6vVhFGfvjhB0VHR5suAwAAnIe///3vuvLKK5tc3yrCSLt27SSd/jDh4eGGqwHQklwul9avX6+0tDQFBQWZLgdAC3I6nYqOjvb8jjelVYSRulsz4eHhhBHAz7hcLoWFhSk8PJwwAvipcw2xYAArAAAwijACAACMIowAAACjCCMAAMAowggAADCKMAIAAIwijAAAAKMIIwAAwCjCCABj3G63Nm7cqE2bNmnjxo1yu92mSwJgAGEEgBErV65UbGysUlNTNXfuXKWmpio2NlYrV640XRoAHyOMAPC5lStX6q677lJcXJyKioq0dOlSFRUVKS4uTnfddReBBGhjbJZlWaaLOBen06mIiAgdO3aMuWmAVs7tdis2NlZxcXHKz8+X2+3W2rVrNXz4cNntdmVmZmrHjh3avXu37Ha76XIBXIDm/n5zZQSATxUVFWn//v2aNGmSAgLqfwUFBAQoOztb+/btU1FRkaEKAfgaYQSATx08eFCS1KdPn0bX17XX9QPg/wgjAHyqa9eukqQdO3Y0ur6uva4fAP9HGAHgU0lJSYqJidGMGTNUW1tbb11tba1ycnLUs2dPJSUlGaoQgK8RRgD4lN1u15w5c7RmzRplZmbqs88+08mTJ/XZZ58pMzNTa9asUW5uLo
NXgTYk0HQBANqeESNGaMWKFXriiSd0yy23eNp79uypFStWaMSIEQarA+BrPNoLwBi3262PP/5YH3zwgYYNG6YhQ4ZwRQTwIxft0d5NmzYpIyND3bp1k81mU35+frO3/fTTTxUYGKh+/fp5e1gAfshut2vw4MG65ZZbNHjwYIII0EZ5HUYqKyvVt29fLViwwKvtjh49qjFjxmjo0KHeHhIAAPgxr8eMDBs2TMOGDfP6QA899JBGjRolu93u1dUUAADg33wygHXx4sXau3ev/vKXv+i55547Z//q6mpVV1d7lp1OpyTJ5XLJ5XJdtDoB+Jbb7daGDRu0adMmORwOJScnc6sG8CPN/c2+6GFk9+7deuqpp1RUVKTAwOYdLicnR9OmTWvQvn79eoWFhbV0iQAMKC4u1uLFi3Xo0CFJ0ty5c9WlSxeNGzdOiYmJhqsD0BKqqqqa1e+ihhG3261Ro0Zp2rRpuu6665q9XXZ2trKysjzLTqdT0dHRSktL42kawA+sWrVKs2bN0vDhwzVx4kSVl5crKipKubm5mjVrlpYtW6bf/e53pssEcIHq7mycywU92muz2bRq1SplZmY2uv7o0aPq0KFDvcuutbW1sixLdrtd69ev16233nrO4/BoL+A/mLUXaDua+/t9Ua+MhIeHa/v27fXaXn31VX300UdasWKFevbseTEPD+ASVDdr79KlSxUQECC32+1ZVzdr76BBg1RUVKTk5GRzhQLwGa/DyIkTJ7Rnzx7P8r59+1RaWqqOHTuqR48eys7O1oEDB7RkyRIFBAQ0mJmzS5cuCgkJaXLGTgD+jVl7AfyS1+8Z2bp1q+Lj4xUfHy9JysrKUnx8vKZMmSLp9BdIWVlZy1YJwG8way+AX+J18AB8ijEjQNtx0V4HDwAXgll7AfwSs/YC8Dlm7QVwJm7TADCGWXsB/3ZJPNoLAGdTN2tvZWUls/YCbRhjRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAY5XUY2bRpkzIyMtStWzfZbDbl5+eftf/KlSuVmpqqzp07Kzw8XImJifrwww/Pt14AAOBnvA4jlZWV6tu3rxYsWNCs/ps2bVJqaqrWrl2rkpISDRkyRBkZGfriiy+8LhYAAPifQG83GDZsmIYNG9bs/vPmzau3PGPGDL333nv67//+b8XHx3t7eAAA4Ge8DiMXqra2VsePH1fHjh2b7FNdXa3q6mrPstPplCS5XC65XK6LXiMA36k7pzm3Af/T3PPa52EkNzdXJ06c0D333NNkn5ycHE2bNq1B+/r16xUWFnYxywNgSEFBgekSALSwqqqqZvWzWZZlne9BbDabVq1apczMzGb1f+uttzR+/Hi99957SklJabJfY1dGoqOjdeTIEYWHh59vuQAuQS6XSwUFBUpNTVVQUJDpcgC0IKfTqU6dOunYsWNn/f322ZWRZcuW6cEHH9Q777xz1iAiSQ6HQw6Ho0F7UFAQX1aAn+L8BvxPc89pn7xnZOnSpRo3bpyWLl2q22+/3ReHBAAArYTXV0ZOnDihPXv2eJb37dun0tJSdezYUT169FB2drYOHDigJUuWSDp9a2bs2LGaP3++EhISVF5eLkkKDQ1VREREC30MAADQWnl9ZWTr1q2Kj4/3PJablZWl+Ph4TZkyRZJ08OBBlZWVefovXLhQp06d0iOPPKKuXbt6/k2YMKGFPgIAAGjNvL4ykpycrLONeX399dfrLW/YsMHbQwAAgDaEuWkAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGE
UYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGeR1GNm3apIyMDHXr1k02m035+fnn3GbDhg361a9+JYfDodjYWL3++uvnUSoAAPBHXoeRyspK9e3bVwsWLGhW/3379un222/XkCFDVFpaqscee0wPPvigPvzwQ6+LBQAA/ifQ2w2GDRumYcOGNbt/Xl6eevbsqTlz5kiSrr/+en3yySd68cUXlZ6e7u3hAQCAn/E6jHiruLhYKSkp9drS09P12GOPNblNdXW1qqurPctOp1OS5HK55HK5LkqdAMyoO6c5twH/09zz+qKHkfLyckVGRtZri4yMlNPp1MmTJxUaGtpgm5ycHE2bNq1B+/r16xUWFnbRagVgTkFBgekSALSwqqqqZvW76GHkfGRnZysrK8uz7HQ6FR0drbS0NIWHhxusDEBLc7lcKigoUGpqqoKCgkyXA6AF1d3ZOJeLHkaioqJUUVFRr62iokLh4eGNXhWRJIfDIYfD0aA9KCiILyvAT3F+A/6nuef0RX/PSGJiogoLC+u1FRQUKDEx8WIfGgAAtAJeh5ETJ06otLRUpaWlkk4/ultaWqqysjJJp2+xjBkzxtP/oYce0t69e/Xv//7v2rlzp1599VW9/fbbevzxx1vmEwAAgFbN6zCydetWxcfHKz4+XpKUlZWl+Ph4TZkyRZJ08OBBTzCRpJ49e+r9999XQUGB+vbtqzlz5uhPf/oTj/UCAABJ5zFmJDk5WZZlNbm+sberJicn64svvvD2UAAAoA1gbhoAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBR5xVGFixYoJiYGIWEhCghIUFbtmw5a/958+apV69eCg0NVXR0tB5//HH9/PPP51UwAADwL16HkeXLlysrK0tTp07Vtm3b1LdvX6Wnp+vQoUON9n/rrbf01FNPaerUqfr666/15z//WcuXL9ekSZMuuHgAAND6eR1G5s6dq/Hjx2vcuHG64YYblJeXp7CwMC1atKjR/ps3b9ZvfvMbjRo1SjExMUpLS9O99957zqspAACgbQj0pnNNTY1KSkqUnZ3taQsICFBKSoqKi4sb3WbQoEH6y1/+oi1btmjgwIHau3ev1q5dq9GjRzd5nOrqalVXV3uWnU6nJMnlcsnlcnlTMoBLXN05zbkN+J/mntdehZEjR47I7XYrMjKyXntkZKR27tzZ6DajRo3SkSNHdPPNN8uyLJ06dUoPPfTQWW/T5OTkaNq0aQ3a169fr7CwMG9KBtBKFBQUmC4BQAurqqpqVj+vwsj52LBhg2bMmKFXX31VCQkJ2rNnjyZMmKDp06dr8uTJjW6TnZ2trKwsz7LT6VR0dLTS0tIUHh5+sUsG4EMul0sFBQVKTU1VUFCQ6XIAtKC6Oxvn4lUY6dSpk+x2uyoqKuq1V1RUKCoqqtFtJk+erNGjR+vBBx+UJMXFxamyslL/8i//oj/+8Y8KCGg4bMXhcMjhcDRoDwoK4ssK8FOc34D/ae457dUA1uDgYPXv31+FhYWettraWhUWFioxMbHRbaqqqhoEDrvdLkmyLMubwwMAAD/k9W2arKwsjR
07VgMGDNDAgQM1b948VVZWaty4cZKkMWPGqHv37srJyZEkZWRkaO7cuYqPj/fcppk8ebIyMjI8oQQAALRdXoeRkSNH6vDhw5oyZYrKy8vVr18/rVu3zjOotaysrN6VkKefflo2m01PP/20Dhw4oM6dOysjI0PPP/98y30KAADQatmsVnCvxOl0KiIiQseOHWMAK+BnXC6X1q5dq+HDhzNmBPAzzf39Zm4aAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARp1XGFmwYIFiYmIUEhKihIQEbdmy5az9jx49qkceeURdu3aVw+HQddddp7Vr155XwQAAwL8EervB8uXLlZWVpby8PCUkJGjevHlKT0/Xrl271KVLlwb9a2pqlJqaqi5dumjFihXq3r27vvvuO7Vv374l6gcAAK2c12Fk7ty5Gj9+vMaNGydJysvL0/vvv69FixbpqaeeatB/0aJF+umnn7R582YFBQVJkmJiYi6sagAA4De8CiM1NTUqKSlRdna2py0gIEApKSkqLi5udJvVq1crMTFRjzzyiN577z117txZo0aN0pNPPim73d7oNtXV1aqurvYsO51OSZLL5ZLL5fKmZACXuLpzmnMb8D/NPa+9CiNHjhyR2+1WZGRkvfbIyEjt3Lmz0W327t2rjz76SPfdd5/Wrl2rPXv26A9/+INcLpemTp3a6DY5OTmaNm1ag/b169crLCzMm5IBtBIFBQWmSwDQwqqqqprVz+vbNN6qra1Vly5dtHDhQtntdvXv318HDhzQ7Nmzmwwj2dnZysrK8iw7nU5FR0crLS1N4eHhF7tkAD7kcrlUUFCg1NRUz61cAP6h7s7GuXgVRjp16iS73a6Kiop67RUVFYqKimp0m65duyooKKjeLZnrr79e5eXlqqmpUXBwcINtHA6HHA5Hg/agoCC+rAA/xfkN+J/mntNePdobHBys/v37q7Cw0NNWW1urwsJCJSYmNrrNb37zG+3Zs0e1tbWetm+++UZdu3ZtNIgAAIC2xev3jGRlZem1117TG2+8oa+//loPP/ywKisrPU/XjBkzpt4A14cfflg//fSTJkyYoG+++Ubvv/++ZsyYoUceeaTlPgUAAGi1vB4zMnLkSB0+fFhTpkxReXm5+vXrp3Xr1nkGtZaVlSkg4B8ZJzo6Wh9++KEef/xx3XjjjerevbsmTJigJ598suU+BQAAaLVslmVZpos4F6fTqYiICB07dowBrICfcblcWrt2rYYPH86YEcDPNPf3m7lpAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARp1XGFmwYIFiYmIUEhKihIQEbdmypVnbLVu2TDabTZmZmedzWAAA4Ie8DiPLly9XVlaWpk6dqm3btqlv375KT0/XoUOHzrrd/v37NXHiRCUlJZ13sQAAwP94HUbmzp2r8ePHa9y4cbrhhhuUl5ensLAwLVq0qMlt3G637rvvPk2bNk1XX331BRUMAAD8S6A3nWtqalRSUqLs7GxPW0BAgFJSUlRcXNzkds8++6
y6dOmiBx54QEVFRec8TnV1taqrqz3LTqdTkuRyueRyubwpGcAlru6c5twG/E9zz2uvwsiRI0fkdrsVGRlZrz0yMlI7d+5sdJtPPvlEf/7zn1VaWtrs4+Tk5GjatGkN2tevX6+wsDBvSgbQShQUFJguAUALq6qqalY/r8KIt44fP67Ro0frtddeU6dOnZq9XXZ2trKysjzLTqdT0dHRSktLU3h4+MUoFYAhLpdLBQUFSk1NVVBQkOlyALSgujsb5+JVGOnUqZPsdrsqKirqtVdUVCgqKqpB/2+//Vb79+9XRkaGp622tvb0gQMDtWvXLl1zzTUNtnM4HHI4HA3ag4KC+LIC/BTnN+B/mntOezWANTg4WP3791dhYaGnrba2VoWFhUpMTGzQv3fv3tq+fbtKS0s9/377299qyJAhKi0tVXR0tDeHBwAAfsjr2zRZWVkaO3asBgwYoIEDB2revHmqrKzUuHHjJEljxoxR9+7dlZOTo5CQEPXp06fe9u3bt5ekBu0AAKBt8jqMjBw5UocPH9aUKVNUXl6ufv36ad26dZ5BrWVlZQoI4MWuAACgeWyWZVmmizgXp9OpiIgIHTt2jAGsgJ9xuVxau3athg8fzpgRwM809/ebSxgAAMAowggAADCKMAIAAIwijAAAAKMIIwCMcbvd2rhxozZt2qSNGzfK7XabLgmAAYQRAEasXLlSsbGxSk1N1dy5c5WamqrY2FitXLnSdGkAfIwwAsDnVq5cqbvuuktxcXEqKirS0qVLVVRUpLi4ON11110EEqCN4T0jAHzK7XYrNjZWcXFxys/Pl9vt9rxnxG63KzMzUzt27NDu3btlt9tNlwvgAvCeEQCXpKKiIu3fv1+TJk1q8LbmgIAAZWdna9++fSoqKjJUIQBfI4wA8KmDBw9Kanp+qrr2un4A/B9hBIBPde3aVZK0Y8eORtfXtdf1A+D/CCMAfCopKUkxMTGaMWOGamtr662rra1VTk6OevbsqaSkJEMVAvA1wggAn7Lb7ZozZ47WrFmjzMxMffbZZzp58qQ+++wzZWZmas2aNcrNzWXwKtCGBJouAEDbM2LECK1YsUJPPPGEbrnlFk97z549tWLFCo0YMcJgdQB8jUd7ARjjdrv18ccf64MPPtCwYcM0ZMgQrogAfqS5v99cGQFgjN1u1+DBg1VZWanBgwcTRIA2ijEjAADAKMIIAAAwijACwBhm7QUgEUYAGMKsvQDqEEYA+Byz9gI4E4/2AvApZu0F2g5m7QVwSWLWXgC/RBgB4FPM2gvglwgjAHyKWXsB/BJhBIBPMWsvgF8ijADwKWbtBfBLzE0DwOeYtRfAmXi0F4AxzNoL+Ddm7QVwyWPWXgASY0YAGMTcNAAkwggAQ5ibBkAdwggAn6ubm6ZPnz566aWX9Oijj+qll15Snz59mJsGaIMYwArAp+rmpunUqZMOHz6s7777zrPuqquuUufOnfXjjz8yNw3gBxjACuCSVDc3zf79+xUaGlpv3aFDhzzhpKioSMnJyQYqBOBr3KYB4FMHDhzw/D106FAVFRVp6dKlKioq0tChQxvtB8C/nVcYWbBggWJiYhQSEqKEhARt2bKlyb6vvfaakpKS1KFDB3Xo0EEpKSln7Q/Av1VUVEiS+vbtq/fee08JCQkKDQ1VQkKC3nvvPd144431+gHwf16HkeXLlysrK0tTp07Vtm3b1LdvX6Wnp+vQoUON9t+wYYPuvfdeffzxxyouLlZ0dLTS0tL4rx6gjfrxxx8lqcEtmjphYWH1+gHwf16Hkblz52r8+PEaN26cbrjhBuXl5SksLEyLFi1qtP+bb76pP/zhD+rXr5969+6tP/3pT6qtrVVhYeEFFw+g9QkIOP21UzcXzS/npvnss8/q9QPg/7wawFpTU6OSkhJlZ2d72gICApSSkqLi4uJm7aOqqkoul0sdO3Zssk91dbWqq6s9y06nU5Lkcrnkcrm8KRnAJaZuNt5evXrpq6++qjc3TU
xMjHr16qVdu3YpKSmJ8x1o5Zp7DnsVRo4cOSK3263IyMh67ZGRkdq5c2ez9vHkk0+qW7duSklJabJPTk6Opk2b1qB9/fr1nku4AFont9utiIgI7dq1S/3791daWpocDoeqq6u1bds2lZSUKCIiQidOnNDatWtNlwvgAlRVVTWrn08f7Z05c6aWLVumDRs2KCQkpMl+2dnZysrK8iw7nU7PWBPeMwK0fgsXLtTvf/97/e1vf1NJSYmnPSwsTDabTQsXLlRGRobBCgG0hLo7G+fiVRjp1KmT7HZ7g1HuFRUVioqKOuu2ubm5mjlzpv7nf/7HM1q+KQ6HQw6Ho0F7UFCQgoKCvCkZwCXonnvuUWBgoJ544gnt37/f0x4ZGanc3FyNGDHCXHEAWkxzf7O9GiEWHBys/v371xt8WjcYNTExscntZs2apenTp2vdunUaMGCAN4cE4KdGjBihXbt2KTc3V8OHD1dubq527txJEAHaIK+Hq2dlZem1117TG2+8oa+//loPP/ywKisrNW7cOEnSmDFj6g1wfeGFFzR58mQtWrRIMTExKi8vV3l5uU6cONFynwJAq7Ny5Ur16tVLEydO1Nq1azVx4kT16tWLeWmANsjrMDJy5Ejl5uZqypQp6tevn0pLS7Vu3TrPoNaysjIdPHjQ0/8//uM/VFNTo7vuuktdu3b1/MvNzW25TwGgVambKC8uLq7eG1jj4uKYKA9og5goD4BP1U2UFxcXp/z8fLndbq1du1bDhw+X3W5XZmamduzYwUR5gB9o7u83bxUC4FN1E+VNmjRJlmVp48aN2rRpkzZu3CjLspSdna19+/apqKjIdKkAfIQwAsCn6m7jfvvtt4qNjVVqaqrmzp2r1NRUxcbGau/evfX6AfB/hBEAPtW1a1dJ0ujRoxsdMzJ69Oh6/QD4P8aMAPCpmpoaXXbZZbriiiv0/fffy7Isz5gRm82mK6+8Uj/++KMqKysVHBxsulwAF6C5v98+fQMrAGzevFmnTp1SRUWFfve73yk1NVW7d+/Wd999p4KCAs9LFTdv3qzk5GSzxQLwCcIIAJ+qGwsyYcIEvfLKK1qzZo1nnd1u14QJEzR//nzGjABtCGNGAPhU3ViQ+fPnN7gNExwcrPnz59frB8D/EUYA+NSgQYMUEHD6q2fo0KH1BrAOHTpUkhQQEKBBgwaZLBOADxFGAPhUUVGRamtrJUmWZWnbtm369NNPtW3bNtWNp6+treU9I0AbwpgRAD61YcMGSadn7n333Xf1/vvve9bZ7Xbdc889evvtt7VhwwbPlRIA/o0rIwCMePvttxsdM/L2228bqgiAKVwZAeBTSUlJnr9vvfVWpaen65tvvtF1112nDz/80HOl5Mx+APwbYQSAMYWFhfVu04SEhBisBoAp3KYB4FNnDkz9+eef6607c5kBrEDbQRgB4FN1T9JIDa+EnLl8Zj8A/o0wAsCn2rdvL+l08OjUqVO9dZ06dfIEkrp+APwfY0YA+NTRo0clnb4l8/3339dbd+ZyXT8A/o8rIwAAwCjCCACfuvzyyz1/d+/evd66K6+8stF+APwbYQSAT3300Ueev48cOVJv3eHDhxvtB8C/EUYA+NSZY0Gqq6vrrTtzmTEjQNtBGAHgU7/61a9atB+A1o8wAsCn7rjjjhbtB6D1I4wA8Klly5a1aD8ArR9hBIBP7du3r0X7AWj9eOkZAJ86efKk5+/OnTvrhhtu0OHDh9W5c2f97W9/8zxRc2Y/AP6NMALAp858YmbAgAHKzs7WgQMH1L17d+Xk5OiDDz5o0A+AfyOMAPCpM694rFu3zhM+JCkgIKDRfgD8G2EEgNeqqqq0c+fO89o2PDzc87dlWfXWnTlTb3h4uLZt2+b1/nv37q2wsLDzqg2AGYQRAF7buXOn+vfvf1GP8dVXX53XMUpKSnhHCdDKEEYAeK13794qKSk5r21ramo0aNCgBldFzmSz2bR582YFBwefV20AWhfCCACvhY
WFXdDVh4kTJ2r27NlnXX/TTTed9/4BtC6EEQA+N2vWLEnS3Llz5Xa7Pe2BgYF6/PHHPesBtA0262zXSi8RTqdTEREROnbsWL3BbwBat5qaGv3xuVn6z/c/1/93e4Kef/rfz+vWDIBLU3N/v7kyArQx+45UqrL6lOkyPG767Ri9UxOvm34bp28On5R0aTzSe5kjUD07XWa6DKBNOK8wsmDBAs2ePVvl5eXq27evXn75ZQ0cOLDJ/u+8844mT56s/fv369prr9ULL7yg4cOHn3fRAM7PviOVGpK7wXQZjXpixXbTJTTw8cRkAgngA16HkeXLlysrK0t5eXlKSEjQvHnzlJ6erl27dqlLly4N+m/evFn33nuvcnJydMcdd+itt95SZmamtm3bpj59+rTIhwDQPHVXROaN7KfYLpcbrua0ypPVWrOhWHckJ+qyUIfpciRJew6d0GPLSy+pK0iAP/M6jMydO1fjx4/XuHHjJEl5eXl6//33tWjRIj311FMN+s+fP1+33Xab/u3f/k2SNH36dBUUFOiVV15RXl7eBZYPwFu2QKfsIQcUEHJphJHQwFPq1uEHhbYrV0DgpXHn2B5yQrZAp+kygDbDqzO/pqZGJSUlys7O9rQFBAQoJSVFxcXFjW5TXFysrKysem3p6enKz89v8jjV1dX15qVwOk9/KbhcLrlcLm9KBnCG4yerFdT+c03aMsN0KQ28uu5V0yXUE9R+qE6dSuM7B7gAzT1/vAojR44ckdvtVmRkZL32yMjIJl8NXV5e3mj/8vLyJo+Tk5OjadOmNWhfv349r3kGLkBxhU2uowk6deKGC9pPzZG/68c1uS1UVcu64o6JCu4UfcH7sU610/8Wf6LvQlugKKCNqqqqala/S+Oa6C9kZ2fXu5ridDoVHR2ttLQ0Hu0FLsBNlTWK+/qQru58mUKD7Oe9n5Mnq7R/2JAWqcl9yq3t27crLi5O9sDzr6lOzDXXKjT0wv+j5TKHXTFXMHgVuBB1dzbOxasw0qlTJ9ntdlVUVNRrr6ioUFRUVKPbREVFedVfkhwOhxyOhgPZgoKCFBQU5E3JAM4Q2T5I9yX2bIE9XaHE3hd+9UE6fRm3nao0fHgy5zfgZ5p7Tgecu8s/BAcHq3///iosLPS01dbWqrCwUImJiY1uk5iYWK+/JBUUFDTZHwAAtC1e36bJysrS2LFjNWDAAA0cOFDz5s1TZWWl5+maMWPGqHv37srJyZEkTZgwQYMHD9acOXN0++23a9myZdq6dasWLlzYsp8EAAC0Sl6HkZEjR+rw4cOaMmWKysvL1a9fP61bt84zSLWsrEwBAf+44DJo0CC99dZbevrppzVp0iRde+21ys/P5x0jAABAEnPTADDM5XJp7dq1Gj58OGNGAD/T3N9vr8aMAAAAtDTCCAAAMIowAgAAjCKMAAAAowgjAADAKMIIAAAwijACAACMIowAAACjCCMAAMAor18Hb0LdS2KbOxUxgNbD5XKpqqpKTqeTN7ACfqbud/tcL3tvFWHk+PHjkqTo6JaZshwAAPjO8ePHFRER0eT6VjE3TW1trX744Qe1a9dONpvNdDkAWpDT6VR0dLT+/ve/M/cU4Gcsy9Lx48fVrVu3epPo/lKrCCMA/BcTYQJgACsAADCKMAIAAIwijAAwyuFwaOrUqXI4HKZLAWAIY0YAAIBRXBkBAABGEUYAAIBRhBEAAGAUYQSA15KTk/XYY49dMvsB0LoRRoBW5v7775fNZpPNZlNwcLBiY2P17LPP6tSpU6ZLa9KGDRtks9l09OjReu0rV67U9OnTW/RYr732mpKSktShQwd16NBBKSkp2rJlS70+lmVpypQp6tq1q0JDQ5WSkqLdu3d71u/fv18PPPCAevbsqdDQUF1zzTWaOnWqampq6u3nq6++UlJSkkJCQhQdHa1Zs2a16GcB2grCCNAK3XbbbTp48KB2796tJ554Qs8884xmz55tuiyvdezYUe3atWvRfW7YsE
H33nuvPv74YxUXFys6OlppaWk6cOCAp8+sWbP00ksvKS8vT59//rkuu+wypaen6+eff5Yk7dy5U7W1tfrP//xP/fWvf9WLL76ovLw8TZo0ybMPp9OptLQ0XXXVVSopKdHs2bP1zDPPaOHChS36eYA2wQLQqowdO9a6884767WlpqZaN910k/XTTz9Zo0ePttq3b2+FhoZat912m/XNN994+i1evNiKiIiwVq1aZcXGxloOh8NKS0uzysrKzrr/CRMmWIMHD/YsDx482JowYYJnecmSJVb//v2tyy+/3IqMjLTuvfdeq6KiwrIsy9q3b58lqd6/sWPHNrqf5ta/bt06q3fv3tZll11mpaenWz/88EOT/3udOnXKateunfXGG29YlmVZtbW1VlRUlDV79mxPn6NHj1oOh8NaunRpk/uZNWuW1bNnT8/yq6++anXo0MGqrq72tD355JNWr169mtwHgMZxZQTwA6GhoaqpqdH999+vrVu3avXq1SouLpZlWRo+fLhcLpenb1VVlZ5//nktWbJEn376qY4eParf//73F3R8l8ul6dOn68svv1R+fr7279+v+++/X9Lp2bbfffddSdKuXbt08OBBzZ8/v9H9NLf+3Nxc/dd//Zc2bdqksrIyTZw4scnaqqqq5HK51LFjR0nSvn37VF5erpSUFE+fiIgIJSQkqLi4uMn9HDt2zLMPSSouLtYtt9yi4OBgT1t6erp27dql//u//zvL/1oAfinQdAEAzp9lWSosLNSHH36oYcOGKT8/X59++qkGDRokSXrzzTcVHR2t/Px83X333ZJOB4dXXnlFCQkJkqQ33nhD119/vbZs2aKBAweeVx3//M//7Pn76quv1ksvvaRf//rXOnHihC6//HLPj3iXLl3Uvn37Rvexe/durV69uln15+Xl6ZprrpEkPfroo3r22WebrO3JJ59Ut27dPOGjvLxckhQZGVmvX2RkpGfdL+3Zs0cvv/yycnNzPW3l5eXq2bNng33UrevQoUOTNQGojysjQCu0Zs0aXX755QoJCdGwYcM0cuRI3X///QoMDPSEDEm64oor1KtXL3399deetsDAQP3617/2LPfu3Vvt27ev18dbJSUlysjIUI8ePdSuXTsNHjxYklRWVtbsfXz99dfNqj8sLMwTRCSpa9euOnToUKP7nDlzppYtW6ZVq1YpJCTE248lSTpw4IBuu+023X333Ro/fvx57QPA2RFGgFZoyJAhKi0t1e7du3Xy5Em98cYbstlsLbLvgIAAWb+YJeLM2yS/VFlZqfT0dIWHh+vNN9/U//7v/2rVqlWS1ODpk5YQFBRUb9lmszWoV5Jyc3M1c+ZMrV+/XjfeeKOnPSoqSpJUUVFRr39FRYVnXZ0ffvhBQ4YM0aBBgxoMTI2Kimp0H2ceA0DzEEaAVuiyyy5TbGysevToocDA03dbr7/+ep06dUqff/65p9+PP/6oXbt26YYbbvC0nTp1Slu3bvUs79q1S0ePHtX1118vSercubMOHjxY73ilpaVN1rJz5079+OOPmjlzppKSktS7d+8GVyrqxlW43e4m99Pc+ptj1qxZmj59utatW6cBAwbUW9ezZ09FRUWpsLDQ0+Z0OvX5558rMTHR03bgwAElJyerf//+Wrx4sQIC6n9dJiYmatOmTfWCWkFBgXr16sUtGsBLhBHAT1x77bW68847NX78eH3yySf68ssv9U//9E/q3r277rzzTk+/oKAg/eu//qs+//xzlZSU6P7779dNN93kGS9y6623auvWrVqyZIl2796tqVOnaseOHU0et0ePHgoODtbLL7+svXv3avXq1Q3eHXLVVVfJZrNpzZo1Onz4sE6cOHHe9Z/LCy+8oMmTJ2vRokWKiYlReXm5ysvLPce02Wx67LHH9Nxzz2n16tXavn27xowZo27duikzM1PSP4JIjx49lJubq8OHD3v2U2fUqFEKDg7WAw88oL/+9a9avny55s+fr6ysrGbXCu
D/MfosDwCvNfbobZ26R2MjIiKs0NBQKz09vdFHY999913r6quvthwOh5WSkmJ999139fYzZcoUKzIy0oqIiLAef/xx69FHHz3ro71vvfWWFRMTYzkcDisxMdFavXq1Jcn64osvPH2effZZKyoqyrLZbOd8tPdc9Z9p1apV1plfZVdddVWDR4klWVOnTvX0qa2ttSZPnmxFRkZaDofDGjp0qLVr1656x2lsH7/8yvzyyy+tm2++2XI4HFb37t2tmTNnNvr/C4Czs1lWIzdbAfil119/XY899liDN6ECgEncpgEAAEYRRgAAgFHcpgEAAEZxZQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAY9f8DA3dqP7P3BxYAAAAASUVORK5CYII=",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"countries.boxplot(column=\"Population2020\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Отсечение данных для признака Население, значение которых больше 50000000\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>PopulationClip</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Brazil</td>\n",
|
||
" <td>212559417</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>Nigeria</td>\n",
|
||
" <td>206139589</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Bangladesh</td>\n",
|
||
" <td>164689383</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Russia</td>\n",
|
||
" <td>145934462</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>Mexico</td>\n",
|
||
" <td>128932753</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>Japan</td>\n",
|
||
" <td>126476461</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>Ethiopia</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>Philippines</td>\n",
|
||
" <td>109581078</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>Egypt</td>\n",
|
||
" <td>102334404</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>Vietnam</td>\n",
|
||
" <td>97338579</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>DR Congo</td>\n",
|
||
" <td>89561403</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>Turkey</td>\n",
|
||
" <td>84339067</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>Iran</td>\n",
|
||
" <td>83992949</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>Germany</td>\n",
|
||
" <td>83783942</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>Thailand</td>\n",
|
||
" <td>69799978</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>21</th>\n",
|
||
" <td>United Kingdom</td>\n",
|
||
" <td>67886011</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>22</th>\n",
|
||
" <td>France</td>\n",
|
||
" <td>65273511</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>23</th>\n",
|
||
" <td>Italy</td>\n",
|
||
" <td>60461826</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>24</th>\n",
|
||
" <td>Tanzania</td>\n",
|
||
" <td>59734218</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25</th>\n",
|
||
" <td>South Africa</td>\n",
|
||
" <td>59308690</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>26</th>\n",
|
||
" <td>Myanmar</td>\n",
|
||
" <td>54409800</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>27</th>\n",
|
||
" <td>Kenya</td>\n",
|
||
" <td>53771296</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>28</th>\n",
|
||
" <td>South Korea</td>\n",
|
||
" <td>51269185</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>29</th>\n",
|
||
" <td>Colombia</td>\n",
|
||
" <td>50882891</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 PopulationClip\n",
|
||
"no \n",
|
||
"1 China 1439323776 50000000\n",
|
||
"2 India 1380004385 50000000\n",
|
||
"3 United States 331002651 50000000\n",
|
||
"4 Indonesia 273523615 50000000\n",
|
||
"5 Pakistan 220892340 50000000\n",
|
||
"6 Brazil 212559417 50000000\n",
|
||
"7 Nigeria 206139589 50000000\n",
|
||
"8 Bangladesh 164689383 50000000\n",
|
||
"9 Russia 145934462 50000000\n",
|
||
"10 Mexico 128932753 50000000\n",
|
||
"11 Japan 126476461 50000000\n",
|
||
"12 Ethiopia 114963588 50000000\n",
|
||
"13 Philippines 109581078 50000000\n",
|
||
"14 Egypt 102334404 50000000\n",
|
||
"15 Vietnam 97338579 50000000\n",
|
||
"16 DR Congo 89561403 50000000\n",
|
||
"17 Turkey 84339067 50000000\n",
|
||
"18 Iran 83992949 50000000\n",
|
||
"19 Germany 83783942 50000000\n",
|
||
"20 Thailand 69799978 50000000\n",
|
||
"21 United Kingdom 67886011 50000000\n",
|
||
"22 France 65273511 50000000\n",
|
||
"23 Italy 60461826 50000000\n",
|
||
"24 Tanzania 59734218 50000000\n",
|
||
"25 South Africa 59308690 50000000\n",
|
||
"26 Myanmar 54409800 50000000\n",
|
||
"27 Kenya 53771296 50000000\n",
|
||
"28 South Korea 51269185 50000000\n",
|
||
"29 Colombia 50882891 50000000"
|
||
]
|
||
},
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"countries_norm = countries.copy()\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationClip\"] = countries_norm[\"Population2020\"].clip(0, 50000000)\n",
|
||
"\n",
|
||
"countries_norm[countries_norm[\"Population2020\"] > 50000000][\n",
|
||
" [\"Country\", \"Population2020\", \"PopulationClip\"]\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Винсоризация признака Возраст"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"111195830.99999991\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>PopulationWinsorized</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Brazil</td>\n",
|
||
" <td>212559417</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>Nigeria</td>\n",
|
||
" <td>206139589</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Bangladesh</td>\n",
|
||
" <td>164689383</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Russia</td>\n",
|
||
" <td>145934462</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>Mexico</td>\n",
|
||
" <td>128932753</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>Japan</td>\n",
|
||
" <td>126476461</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>Ethiopia</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>Philippines</td>\n",
|
||
" <td>109581078</td>\n",
|
||
" <td>109581078</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>Egypt</td>\n",
|
||
" <td>102334404</td>\n",
|
||
" <td>102334404</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>Vietnam</td>\n",
|
||
" <td>97338579</td>\n",
|
||
" <td>97338579</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>DR Congo</td>\n",
|
||
" <td>89561403</td>\n",
|
||
" <td>89561403</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>Turkey</td>\n",
|
||
" <td>84339067</td>\n",
|
||
" <td>84339067</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>Iran</td>\n",
|
||
" <td>83992949</td>\n",
|
||
" <td>83992949</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>Germany</td>\n",
|
||
" <td>83783942</td>\n",
|
||
" <td>83783942</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>Thailand</td>\n",
|
||
" <td>69799978</td>\n",
|
||
" <td>69799978</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>21</th>\n",
|
||
" <td>United Kingdom</td>\n",
|
||
" <td>67886011</td>\n",
|
||
" <td>67886011</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>22</th>\n",
|
||
" <td>France</td>\n",
|
||
" <td>65273511</td>\n",
|
||
" <td>65273511</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>23</th>\n",
|
||
" <td>Italy</td>\n",
|
||
" <td>60461826</td>\n",
|
||
" <td>60461826</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>24</th>\n",
|
||
" <td>Tanzania</td>\n",
|
||
" <td>59734218</td>\n",
|
||
" <td>59734218</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25</th>\n",
|
||
" <td>South Africa</td>\n",
|
||
" <td>59308690</td>\n",
|
||
" <td>59308690</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>26</th>\n",
|
||
" <td>Myanmar</td>\n",
|
||
" <td>54409800</td>\n",
|
||
" <td>54409800</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>27</th>\n",
|
||
" <td>Kenya</td>\n",
|
||
" <td>53771296</td>\n",
|
||
" <td>53771296</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>28</th>\n",
|
||
" <td>South Korea</td>\n",
|
||
" <td>51269185</td>\n",
|
||
" <td>51269185</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>29</th>\n",
|
||
" <td>Colombia</td>\n",
|
||
" <td>50882891</td>\n",
|
||
" <td>50882891</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 PopulationWinsorized\n",
|
||
"no \n",
|
||
"1 China 1439323776 114963588\n",
|
||
"2 India 1380004385 114963588\n",
|
||
"3 United States 331002651 114963588\n",
|
||
"4 Indonesia 273523615 114963588\n",
|
||
"5 Pakistan 220892340 114963588\n",
|
||
"6 Brazil 212559417 114963588\n",
|
||
"7 Nigeria 206139589 114963588\n",
|
||
"8 Bangladesh 164689383 114963588\n",
|
||
"9 Russia 145934462 114963588\n",
|
||
"10 Mexico 128932753 114963588\n",
|
||
"11 Japan 126476461 114963588\n",
|
||
"12 Ethiopia 114963588 114963588\n",
|
||
"13 Philippines 109581078 109581078\n",
|
||
"14 Egypt 102334404 102334404\n",
|
||
"15 Vietnam 97338579 97338579\n",
|
||
"16 DR Congo 89561403 89561403\n",
|
||
"17 Turkey 84339067 84339067\n",
|
||
"18 Iran 83992949 83992949\n",
|
||
"19 Germany 83783942 83783942\n",
|
||
"20 Thailand 69799978 69799978\n",
|
||
"21 United Kingdom 67886011 67886011\n",
|
||
"22 France 65273511 65273511\n",
|
||
"23 Italy 60461826 60461826\n",
|
||
"24 Tanzania 59734218 59734218\n",
|
||
"25 South Africa 59308690 59308690\n",
|
||
"26 Myanmar 54409800 54409800\n",
|
||
"27 Kenya 53771296 53771296\n",
|
||
"28 South Korea 51269185 51269185\n",
|
||
"29 Colombia 50882891 50882891"
|
||
]
|
||
},
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from scipy.stats.mstats import winsorize\n",
|
||
"\n",
|
||
"print(countries_norm[\"Population2020\"].quantile(q=0.95))\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationWinsorized\"] = winsorize(\n",
|
||
" countries_norm[\"Population2020\"].fillna(countries_norm[\"Population2020\"].mean()),\n",
|
||
" (0, 0.05),\n",
|
||
" inplace=False,\n",
|
||
")\n",
|
||
"\n",
|
||
"countries_norm[countries_norm[\"Population2020\"] > 50000000][\n",
|
||
" [\"Country\", \"Population2020\", \"PopulationWinsorized\"]\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Нормализация значений"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>PopulationNorm</th>\n",
|
||
" <th>PopulationClipNorm</th>\n",
|
||
" <th>PopulationWinsorizedNorm</th>\n",
|
||
" <th>PopulationWinsorizedNorm2</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>1.000000e+00</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>9.587866e-01</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>2.299705e-01</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>1.900357e-01</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>1.534691e-01</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>231</th>\n",
|
||
" <td>Montserrat</td>\n",
|
||
" <td>4992</td>\n",
|
||
" <td>2.911786e-06</td>\n",
|
||
" <td>0.000084</td>\n",
|
||
" <td>0.000036</td>\n",
|
||
" <td>-0.999927</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>232</th>\n",
|
||
" <td>Falkland Islands</td>\n",
|
||
" <td>3480</td>\n",
|
||
" <td>1.861292e-06</td>\n",
|
||
" <td>0.000054</td>\n",
|
||
" <td>0.000023</td>\n",
|
||
" <td>-0.999953</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>233</th>\n",
|
||
" <td>Niue</td>\n",
|
||
" <td>1626</td>\n",
|
||
" <td>5.731862e-07</td>\n",
|
||
" <td>0.000017</td>\n",
|
||
" <td>0.000007</td>\n",
|
||
" <td>-0.999986</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>234</th>\n",
|
||
" <td>Tokelau</td>\n",
|
||
" <td>1357</td>\n",
|
||
" <td>3.862927e-07</td>\n",
|
||
" <td>0.000011</td>\n",
|
||
" <td>0.000005</td>\n",
|
||
" <td>-0.999990</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>235</th>\n",
|
||
" <td>Holy See</td>\n",
|
||
" <td>801</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>-1.000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>235 rows × 6 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 PopulationNorm PopulationClipNorm \\\n",
|
||
"no \n",
|
||
"1 China 1439323776 1.000000e+00 1.000000 \n",
|
||
"2 India 1380004385 9.587866e-01 1.000000 \n",
|
||
"3 United States 331002651 2.299705e-01 1.000000 \n",
|
||
"4 Indonesia 273523615 1.900357e-01 1.000000 \n",
|
||
"5 Pakistan 220892340 1.534691e-01 1.000000 \n",
|
||
".. ... ... ... ... \n",
|
||
"231 Montserrat 4992 2.911786e-06 0.000084 \n",
|
||
"232 Falkland Islands 3480 1.861292e-06 0.000054 \n",
|
||
"233 Niue 1626 5.731862e-07 0.000017 \n",
|
||
"234 Tokelau 1357 3.862927e-07 0.000011 \n",
|
||
"235 Holy See 801 0.000000e+00 0.000000 \n",
|
||
"\n",
|
||
" PopulationWinsorizedNorm PopulationWinsorizedNorm2 \n",
|
||
"no \n",
|
||
"1 1.000000 1.000000 \n",
|
||
"2 1.000000 1.000000 \n",
|
||
"3 1.000000 1.000000 \n",
|
||
"4 1.000000 1.000000 \n",
|
||
"5 1.000000 1.000000 \n",
|
||
".. ... ... \n",
|
||
"231 0.000036 -0.999927 \n",
|
||
"232 0.000023 -0.999953 \n",
|
||
"233 0.000007 -0.999986 \n",
|
||
"234 0.000005 -0.999990 \n",
|
||
"235 0.000000 -1.000000 \n",
|
||
"\n",
|
||
"[235 rows x 6 columns]"
|
||
]
|
||
},
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn import preprocessing\n",
|
||
"\n",
|
||
"min_max_scaler = preprocessing.MinMaxScaler()\n",
|
||
"\n",
|
||
"min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" countries_norm[\"Population2020\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationClipNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" countries_norm[\"PopulationClip\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationWinsorizedNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationWinsorizedNorm2\"] = min_max_scaler_2.fit_transform(\n",
|
||
" countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\n",
|
||
" [\n",
|
||
" \"Country\",\n",
|
||
" \"Population2020\",\n",
|
||
" \"PopulationNorm\",\n",
|
||
" \"PopulationClipNorm\",\n",
|
||
" \"PopulationWinsorizedNorm\",\n",
|
||
" \"PopulationWinsorizedNorm2\",\n",
|
||
" ]\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Стандартизация значений"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>PopulationStand</th>\n",
|
||
" <th>PopulationClipStand</th>\n",
|
||
" <th>PopulationWinsorizedStand</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>10.427597</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>9.987702</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>2.208627</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>1.782380</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>1.392082</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>231</th>\n",
|
||
" <td>Montserrat</td>\n",
|
||
" <td>4992</td>\n",
|
||
" <td>-0.245950</td>\n",
|
||
" <td>-0.795071</td>\n",
|
||
" <td>-0.621969</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>232</th>\n",
|
||
" <td>Falkland Islands</td>\n",
|
||
" <td>3480</td>\n",
|
||
" <td>-0.245962</td>\n",
|
||
" <td>-0.795158</td>\n",
|
||
" <td>-0.622019</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>233</th>\n",
|
||
" <td>Niue</td>\n",
|
||
" <td>1626</td>\n",
|
||
" <td>-0.245975</td>\n",
|
||
" <td>-0.795265</td>\n",
|
||
" <td>-0.622080</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>234</th>\n",
|
||
" <td>Tokelau</td>\n",
|
||
" <td>1357</td>\n",
|
||
" <td>-0.245977</td>\n",
|
||
" <td>-0.795280</td>\n",
|
||
" <td>-0.622089</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>235</th>\n",
|
||
" <td>Holy See</td>\n",
|
||
" <td>801</td>\n",
|
||
" <td>-0.245982</td>\n",
|
||
" <td>-0.795312</td>\n",
|
||
" <td>-0.622107</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>235 rows × 5 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 PopulationStand PopulationClipStand \\\n",
|
||
"no \n",
|
||
"1 China 1439323776 10.427597 2.073933 \n",
|
||
"2 India 1380004385 9.987702 2.073933 \n",
|
||
"3 United States 331002651 2.208627 2.073933 \n",
|
||
"4 Indonesia 273523615 1.782380 2.073933 \n",
|
||
"5 Pakistan 220892340 1.392082 2.073933 \n",
|
||
".. ... ... ... ... \n",
|
||
"231 Montserrat 4992 -0.245950 -0.795071 \n",
|
||
"232 Falkland Islands 3480 -0.245962 -0.795158 \n",
|
||
"233 Niue 1626 -0.245975 -0.795265 \n",
|
||
"234 Tokelau 1357 -0.245977 -0.795280 \n",
|
||
"235 Holy See 801 -0.245982 -0.795312 \n",
|
||
"\n",
|
||
" PopulationWinsorizedStand \n",
|
||
"no \n",
|
||
"1 3.171659 \n",
|
||
"2 3.171659 \n",
|
||
"3 3.171659 \n",
|
||
"4 3.171659 \n",
|
||
"5 3.171659 \n",
|
||
".. ... \n",
|
||
"231 -0.621969 \n",
|
||
"232 -0.622019 \n",
|
||
"233 -0.622080 \n",
|
||
"234 -0.622089 \n",
|
||
"235 -0.622107 \n",
|
||
"\n",
|
||
"[235 rows x 5 columns]"
|
||
]
|
||
},
|
||
"execution_count": 27,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn import preprocessing\n",
|
||
"\n",
|
||
"stndart_scaler = preprocessing.StandardScaler()\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationStand\"] = stndart_scaler.fit_transform(\n",
|
||
" countries_norm[\"Population2020\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationClipStand\"] = stndart_scaler.fit_transform(\n",
|
||
" countries_norm[\"PopulationClip\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationWinsorizedStand\"] = stndart_scaler.fit_transform(\n",
|
||
" countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\n",
|
||
" [\n",
|
||
" \"Country\",\n",
|
||
" \"Population2020\",\n",
|
||
" \"PopulationStand\",\n",
|
||
" \"PopulationClipStand\",\n",
|
||
" \"PopulationWinsorizedStand\",\n",
|
||
" ]\n",
|
||
"]"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.4"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|