3468 lines
123 KiB
Plaintext
3468 lines
123 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Унитарное кодирование\n",
|
||
"\n",
|
||
"Преобразование категориального признака в несколько бинарных признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Загрузка набора данных World Population"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>Yearly</th>\n",
|
||
" <th>NetChange</th>\n",
|
||
" <th>Density</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>Migrants</th>\n",
|
||
" <th>FertRate</th>\n",
|
||
" <th>MedAge</th>\n",
|
||
" <th>UrbanPop</th>\n",
|
||
" <th>WorldShare</th>\n",
|
||
" <th>Net Change</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>0.39</td>\n",
|
||
" <td>5,540,090</td>\n",
|
||
" <td>153</td>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>-348,399</td>\n",
|
||
" <td>1.7</td>\n",
|
||
" <td>38</td>\n",
|
||
" <td>61%</td>\n",
|
||
" <td>18.47%</td>\n",
|
||
" <td>5540090</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>0.99</td>\n",
|
||
" <td>13,586,631</td>\n",
|
||
" <td>464</td>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>-532,687</td>\n",
|
||
" <td>2.2</td>\n",
|
||
" <td>28</td>\n",
|
||
" <td>35%</td>\n",
|
||
" <td>17.70%</td>\n",
|
||
" <td>13586631</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>0.59</td>\n",
|
||
" <td>1,937,734</td>\n",
|
||
" <td>36</td>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>954,806</td>\n",
|
||
" <td>1.8</td>\n",
|
||
" <td>38</td>\n",
|
||
" <td>83%</td>\n",
|
||
" <td>4.25%</td>\n",
|
||
" <td>1937734</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>1.07</td>\n",
|
||
" <td>2,898,047</td>\n",
|
||
" <td>151</td>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>-98,955</td>\n",
|
||
" <td>2.3</td>\n",
|
||
" <td>30</td>\n",
|
||
" <td>56%</td>\n",
|
||
" <td>3.51%</td>\n",
|
||
" <td>2898047</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>2.00</td>\n",
|
||
" <td>4,327,022</td>\n",
|
||
" <td>287</td>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>-233,379</td>\n",
|
||
" <td>3.6</td>\n",
|
||
" <td>23</td>\n",
|
||
" <td>35%</td>\n",
|
||
" <td>2.83%</td>\n",
|
||
" <td>4327022</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>231</th>\n",
|
||
" <td>Montserrat</td>\n",
|
||
" <td>4992</td>\n",
|
||
" <td>0.06</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>50</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>10%</td>\n",
|
||
" <td>0.00%</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>232</th>\n",
|
||
" <td>Falkland Islands</td>\n",
|
||
" <td>3480</td>\n",
|
||
" <td>3.05</td>\n",
|
||
" <td>103</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>12170</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>66%</td>\n",
|
||
" <td>0.00%</td>\n",
|
||
" <td>103</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>233</th>\n",
|
||
" <td>Niue</td>\n",
|
||
" <td>1626</td>\n",
|
||
" <td>0.68</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>260</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>46%</td>\n",
|
||
" <td>0.00%</td>\n",
|
||
" <td>11</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>234</th>\n",
|
||
" <td>Tokelau</td>\n",
|
||
" <td>1357</td>\n",
|
||
" <td>1.27</td>\n",
|
||
" <td>17</td>\n",
|
||
" <td>136</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>0%</td>\n",
|
||
" <td>0.00%</td>\n",
|
||
" <td>17</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>235</th>\n",
|
||
" <td>Holy See</td>\n",
|
||
" <td>801</td>\n",
|
||
" <td>0.25</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2,003</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>N.A.</td>\n",
|
||
" <td>0.00%</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>235 rows × 12 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 Yearly NetChange Density LandArea \\\n",
|
||
"no \n",
|
||
"1 China 1439323776 0.39 5,540,090 153 9388211 \n",
|
||
"2 India 1380004385 0.99 13,586,631 464 2973190 \n",
|
||
"3 United States 331002651 0.59 1,937,734 36 9147420 \n",
|
||
"4 Indonesia 273523615 1.07 2,898,047 151 1811570 \n",
|
||
"5 Pakistan 220892340 2.00 4,327,022 287 770880 \n",
|
||
".. ... ... ... ... ... ... \n",
|
||
"231 Montserrat 4992 0.06 3 50 100 \n",
|
||
"232 Falkland Islands 3480 3.05 103 0 12170 \n",
|
||
"233 Niue 1626 0.68 11 6 260 \n",
|
||
"234 Tokelau 1357 1.27 17 136 10 \n",
|
||
"235 Holy See 801 0.25 2 2,003 0 \n",
|
||
"\n",
|
||
" Migrants FertRate MedAge UrbanPop WorldShare Net Change \n",
|
||
"no \n",
|
||
"1 -348,399 1.7 38 61% 18.47% 5540090 \n",
|
||
"2 -532,687 2.2 28 35% 17.70% 13586631 \n",
|
||
"3 954,806 1.8 38 83% 4.25% 1937734 \n",
|
||
"4 -98,955 2.3 30 56% 3.51% 2898047 \n",
|
||
"5 -233,379 3.6 23 35% 2.83% 4327022 \n",
|
||
".. ... ... ... ... ... ... \n",
|
||
"231 NaN N.A. N.A. 10% 0.00% 3 \n",
|
||
"232 NaN N.A. N.A. 66% 0.00% 103 \n",
|
||
"233 NaN N.A. N.A. 46% 0.00% 11 \n",
|
||
"234 NaN N.A. N.A. 0% 0.00% 17 \n",
|
||
"235 NaN N.A. N.A. N.A. 0.00% 2 \n",
|
||
"\n",
|
||
"[235 rows x 12 columns]"
|
||
]
|
||
},
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"countries = pd.read_csv(\n",
|
||
" \"data/world-population-by-country-2020.csv\", index_col=\"no\"\n",
|
||
")\n",
|
||
"\n",
|
||
"countries[\"Population2020\"] = countries[\"Population2020\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"countries[\"Net Change\"] = countries[\"NetChange\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"countries[\"Yearly\"] = countries[\"Yearly\"].apply(\n",
|
||
" lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
|
||
")\n",
|
||
"countries[\"LandArea\"] = countries[\"LandArea\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"countries"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
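"#### Унитарное кодирование признаков Пол (Sex) и Порт посадки (Embarked) набора данных Titanic (пример; код ниже закомментирован, так как в наборе World Population таких признаков нет)"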
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Кодирование"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"import numpy as np\n",
|
||
"\n",
|
||
"# encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
|
||
"\n",
|
||
"# encoded_values = encoder.fit_transform(titanic[[\"Embarked\", \"Sex\"]])\n",
|
||
"\n",
|
||
"# encoded_columns = encoder.get_feature_names_out([\"Embarked\", \"Sex\"])\n",
|
||
"\n",
|
||
"# encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n",
|
||
"\n",
|
||
"# encoded_values_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Добавление признаков в исходный Dataframe"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# titanic = pd.concat([titanic, encoded_values_df], axis=1)\n",
|
||
"\n",
|
||
"# titanic"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Дискретизация признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Равномерное разделение данных на 3 группы"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"labels = [\"Small\", \"Middle\", \"Big\"]\n",
|
||
"num_bins = 3"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(array([ 0. , 5458956.66666667, 10917913.33333333,\n",
|
||
" 16376870. ]),\n",
|
||
" array([229, 5, 1]))"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"hist1, bins1 = np.histogram(\n",
|
||
" countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins=num_bins\n",
|
||
")\n",
|
||
"bins1, hist1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>(5458956.667, 10917913.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>(5458956.667, 10917913.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>(5458956.667, 10917913.333]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>(10917913.333, 16376870.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>(0.0, 5458956.667]</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 (5458956.667, 10917913.333]\n",
|
||
"2 2973190 (0.0, 5458956.667]\n",
|
||
"3 9147420 (5458956.667, 10917913.333]\n",
|
||
"4 1811570 (0.0, 5458956.667]\n",
|
||
"5 770880 (0.0, 5458956.667]\n",
|
||
"6 8358140 (5458956.667, 10917913.333]\n",
|
||
"7 910770 (0.0, 5458956.667]\n",
|
||
"8 130170 (0.0, 5458956.667]\n",
|
||
"9 16376870 (10917913.333, 16376870.0]\n",
|
||
"10 1943950 (0.0, 5458956.667]\n",
|
||
"11 364555 (0.0, 5458956.667]\n",
|
||
"12 1000000 (0.0, 5458956.667]\n",
|
||
"13 298170 (0.0, 5458956.667]\n",
|
||
"14 995450 (0.0, 5458956.667]\n",
|
||
"15 310070 (0.0, 5458956.667]\n",
|
||
"16 2267050 (0.0, 5458956.667]\n",
|
||
"17 769630 (0.0, 5458956.667]\n",
|
||
"18 1628550 (0.0, 5458956.667]\n",
|
||
"19 348560 (0.0, 5458956.667]\n",
|
||
"20 510890 (0.0, 5458956.667]"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins1))], axis=1\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 Middle\n",
|
||
"2 2973190 Small\n",
|
||
"3 9147420 Middle\n",
|
||
"4 1811570 Small\n",
|
||
"5 770880 Small\n",
|
||
"6 8358140 Middle\n",
|
||
"7 910770 Small\n",
|
||
"8 130170 Small\n",
|
||
"9 16376870 Big\n",
|
||
"10 1943950 Small\n",
|
||
"11 364555 Small\n",
|
||
"12 1000000 Small\n",
|
||
"13 298170 Small\n",
|
||
"14 995450 Small\n",
|
||
"15 310070 Small\n",
|
||
"16 2267050 Small\n",
|
||
"17 769630 Small\n",
|
||
"18 1628550 Small\n",
|
||
"19 348560 Small\n",
|
||
"20 510890 Small"
|
||
]
|
||
},
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins1), labels=labels)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Равномерное разделение данных на 3 группы с установкой собственной границы диапазона значений (от 0 до 12 000 000); значения больше верхней границы при использовании pd.cut получают NaN"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(array([ 0., 4000000., 8000000., 12000000.]),\n",
|
||
" array([229, 1, 4, 1]))"
|
||
]
|
||
},
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"labels = [\"Small\", \"Middle\", \"Big\"]\n",
|
||
"bins2 = np.linspace(0, 12000000, 4)\n",
|
||
"\n",
|
||
"tmp_bins2 = np.digitize(\n",
|
||
" countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins2\n",
|
||
")\n",
|
||
"\n",
|
||
"hist2 = np.bincount(tmp_bins2 - 1)\n",
|
||
"\n",
|
||
"bins2, hist2"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>(8000000.0, 12000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>(8000000.0, 12000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>(8000000.0, 12000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>(0.0, 4000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 (8000000.0, 12000000.0]\n",
|
||
"2 2973190 (0.0, 4000000.0]\n",
|
||
"3 9147420 (8000000.0, 12000000.0]\n",
|
||
"4 1811570 (0.0, 4000000.0]\n",
|
||
"5 770880 (0.0, 4000000.0]\n",
|
||
"6 8358140 (8000000.0, 12000000.0]\n",
|
||
"7 910770 (0.0, 4000000.0]\n",
|
||
"8 130170 (0.0, 4000000.0]\n",
|
||
"9 16376870 NaN\n",
|
||
"10 1943950 (0.0, 4000000.0]\n",
|
||
"11 364555 (0.0, 4000000.0]\n",
|
||
"12 1000000 (0.0, 4000000.0]\n",
|
||
"13 298170 (0.0, 4000000.0]\n",
|
||
"14 995450 (0.0, 4000000.0]\n",
|
||
"15 310070 (0.0, 4000000.0]\n",
|
||
"16 2267050 (0.0, 4000000.0]\n",
|
||
"17 769630 (0.0, 4000000.0]\n",
|
||
"18 1628550 (0.0, 4000000.0]\n",
|
||
"19 348560 (0.0, 4000000.0]\n",
|
||
"20 510890 (0.0, 4000000.0]"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins2))], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>Small</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 Big\n",
|
||
"2 2973190 Small\n",
|
||
"3 9147420 Big\n",
|
||
"4 1811570 Small\n",
|
||
"5 770880 Small\n",
|
||
"6 8358140 Big\n",
|
||
"7 910770 Small\n",
|
||
"8 130170 Small\n",
|
||
"9 16376870 NaN\n",
|
||
"10 1943950 Small\n",
|
||
"11 364555 Small\n",
|
||
"12 1000000 Small\n",
|
||
"13 298170 Small\n",
|
||
"14 995450 Small\n",
|
||
"15 310070 Small\n",
|
||
"16 2267050 Small\n",
|
||
"17 769630 Small\n",
|
||
"18 1628550 Small\n",
|
||
"19 348560 Small\n",
|
||
"20 510890 Small"
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins2), labels=labels)],\n",
|
||
" axis=1,\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Разделение данных на 5 групп с установкой собственных интервалов (0 – 1 000, 1 000 – 100 000, 100 000 – 500 000, 500 000 – 3 000 000, более 3 000 000)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(array([0.e+00, 1.e+03, 1.e+05, 5.e+05, 3.e+06, inf]),\n",
|
||
" array([52, 77, 56, 44, 6]))"
|
||
]
|
||
},
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"labels2 = [\"Dwarf\", \"Small\", \"Middle\", \"Big\", \"Giant\"]\n",
|
||
"hist3, bins3 = np.histogram(\n",
|
||
"\n",
|
||
" countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins=[0, 1000, 100000, 500000, 3000000, np.inf]\n",
|
||
")\n",
|
||
"\n",
|
||
"\n",
|
||
"bins3, hist3"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>(3000000.0, inf]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>(3000000.0, inf]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>(3000000.0, inf]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>(100000.0, 500000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>(3000000.0, inf]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>(100000.0, 500000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>(100000.0, 500000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>(100000.0, 500000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>(100000.0, 500000.0]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>(500000.0, 3000000.0]</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 (3000000.0, inf]\n",
|
||
"2 2973190 (500000.0, 3000000.0]\n",
|
||
"3 9147420 (3000000.0, inf]\n",
|
||
"4 1811570 (500000.0, 3000000.0]\n",
|
||
"5 770880 (500000.0, 3000000.0]\n",
|
||
"6 8358140 (3000000.0, inf]\n",
|
||
"7 910770 (500000.0, 3000000.0]\n",
|
||
"8 130170 (100000.0, 500000.0]\n",
|
||
"9 16376870 (3000000.0, inf]\n",
|
||
"10 1943950 (500000.0, 3000000.0]\n",
|
||
"11 364555 (100000.0, 500000.0]\n",
|
||
"12 1000000 (500000.0, 3000000.0]\n",
|
||
"13 298170 (100000.0, 500000.0]\n",
|
||
"14 995450 (500000.0, 3000000.0]\n",
|
||
"15 310070 (100000.0, 500000.0]\n",
|
||
"16 2267050 (500000.0, 3000000.0]\n",
|
||
"17 769630 (500000.0, 3000000.0]\n",
|
||
"18 1628550 (500000.0, 3000000.0]\n",
|
||
"19 348560 (100000.0, 500000.0]\n",
|
||
"20 510890 (500000.0, 3000000.0]"
|
||
]
|
||
},
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins3))], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 Giant\n",
|
||
"2 2973190 Big\n",
|
||
"3 9147420 Giant\n",
|
||
"4 1811570 Big\n",
|
||
"5 770880 Big\n",
|
||
"6 8358140 Giant\n",
|
||
"7 910770 Big\n",
|
||
"8 130170 Middle\n",
|
||
"9 16376870 Giant\n",
|
||
"10 1943950 Big\n",
|
||
"11 364555 Middle\n",
|
||
"12 1000000 Big\n",
|
||
"13 298170 Middle\n",
|
||
"14 995450 Big\n",
|
||
"15 310070 Middle\n",
|
||
"16 2267050 Big\n",
|
||
"17 769630 Big\n",
|
||
"18 1628550 Big\n",
|
||
"19 348560 Middle\n",
|
||
"20 510890 Big"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins3), labels=labels2)],\n",
|
||
" axis=1,\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Квантильное разделение данных на 5 групп\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 4\n",
|
||
"2 2973190 4\n",
|
||
"3 9147420 4\n",
|
||
"4 1811570 4\n",
|
||
"5 770880 4\n",
|
||
"6 8358140 4\n",
|
||
"7 910770 4\n",
|
||
"8 130170 2\n",
|
||
"9 16376870 4\n",
|
||
"10 1943950 4\n",
|
||
"11 364555 3\n",
|
||
"12 1000000 4\n",
|
||
"13 298170 3\n",
|
||
"14 995450 4\n",
|
||
"15 310070 3\n",
|
||
"16 2267050 4\n",
|
||
"17 769630 4\n",
|
||
"18 1628550 4\n",
|
||
"19 348560 3\n",
|
||
"20 510890 3"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([countries[\"LandArea\"], pd.qcut(countries[\"LandArea\"], q=5, labels=False)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>8358140</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>910770</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>130170</td>\n",
|
||
" <td>Middle</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>16376870</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>1943950</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>364555</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>1000000</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>298170</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>995450</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>310070</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>2267050</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>769630</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>1628550</td>\n",
|
||
" <td>Giant</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>348560</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>510890</td>\n",
|
||
" <td>Big</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LandArea LandArea\n",
|
||
"no \n",
|
||
"1 9388211 Giant\n",
|
||
"2 2973190 Giant\n",
|
||
"3 9147420 Giant\n",
|
||
"4 1811570 Giant\n",
|
||
"5 770880 Giant\n",
|
||
"6 8358140 Giant\n",
|
||
"7 910770 Giant\n",
|
||
"8 130170 Middle\n",
|
||
"9 16376870 Giant\n",
|
||
"10 1943950 Giant\n",
|
||
"11 364555 Big\n",
|
||
"12 1000000 Giant\n",
|
||
"13 298170 Big\n",
|
||
"14 995450 Giant\n",
|
||
"15 310070 Big\n",
|
||
"16 2267050 Giant\n",
|
||
"17 769630 Giant\n",
|
||
"18 1628550 Giant\n",
|
||
"19 348560 Big\n",
|
||
"20 510890 Big"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat([countries[\"LandArea\"], pd.qcut(countries[\"LandArea\"], q=5, labels=labels2)], axis=1).head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Пример конструирования признаков на основе существующих\n",
|
||
"\n",
|
||
"Title - обращение к пассажиру (Mr, Mrs, Miss)\n",
|
||
"\n",
|
||
"Is_married - замужняя ли женщина\n",
|
||
"\n",
|
||
"Cabin_type - палуба (тип каюты)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# titanic_cl = titanic.drop(\n",
|
||
"# [\"Embarked_Q\", \"Embarked_S\", \"Embarked_nan\", \"Sex_male\"], axis=1, errors=\"ignore\"\n",
|
||
"# )\n",
|
||
"# titanic_cl = titanic_cl.dropna()\n",
|
||
"\n",
|
||
"# titanic_cl[\"Title\"] = [\n",
|
||
"# i.split(\",\")[1].split(\".\")[0].strip() for i in titanic_cl[\"Name\"]\n",
|
||
"# ]\n",
|
||
"\n",
|
||
"# titanic_cl[\"Is_married\"] = [1 if i == \"Mrs\" else 0 for i in titanic_cl[\"Title\"]]\n",
|
||
"\n",
|
||
"# titanic_cl[\"Cabin_type\"] = [i[0] for i in titanic_cl[\"Cabin\"]]\n",
|
||
"\n",
|
||
"# titanic_cl"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Пример использования библиотеки Featuretools для автоматического конструирования (синтеза) признаков\n",
|
||
"\n",
|
||
"https://featuretools.alteryx.com/en/stable/getting_started/using_entitysets.html"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Загрузка данных"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1379: SyntaxWarning: invalid escape sequence '\\l'\n",
|
||
" columns_string = \"\\l\".join(column_typing_info) # noqa: W605\n",
|
||
"c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1381: SyntaxWarning: invalid escape sequence '\\l'\n",
|
||
" label = \"{%s (%d row%s)|%s\\l}\" % ( # noqa: W605\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"( no Country Population2020 Yearly NetChange Density \\\n",
|
||
" 0 1 China 1439323776 0.39 5540090 153 \n",
|
||
" 1 2 India 1380004385 0.99 13586631 464 \n",
|
||
" 2 3 United States 331002651 0.59 1937734 36 \n",
|
||
" 3 4 Indonesia 273523615 1.07 2898047 151 \n",
|
||
" 4 5 Pakistan 220892340 2.00 4327022 287 \n",
|
||
" .. ... ... ... ... ... ... \n",
|
||
" 230 231 Montserrat 4992 0.06 3 50 \n",
|
||
" 231 232 Falkland Islands 3480 3.05 103 0 \n",
|
||
" 232 233 Niue 1626 0.68 11 6 \n",
|
||
" 233 234 Tokelau 1357 1.27 17 136 \n",
|
||
" 234 235 Holy See 801 0.25 2 2,003 \n",
|
||
" \n",
|
||
" LandArea \n",
|
||
" 0 9388211 \n",
|
||
" 1 2973190 \n",
|
||
" 2 9147420 \n",
|
||
" 3 1811570 \n",
|
||
" 4 770880 \n",
|
||
" .. ... \n",
|
||
" 230 100 \n",
|
||
" 231 12170 \n",
|
||
" 232 260 \n",
|
||
" 233 10 \n",
|
||
" 234 0 \n",
|
||
" \n",
|
||
" [235 rows x 7 columns],\n",
|
||
" Year Population YearlyPer Yearly Median Fertility Density\n",
|
||
" 0 2020 7794798739 1.10 83000320 31 2.47 52\n",
|
||
" 1 2025 8184437460 0.98 77927744 32 2.54 55\n",
|
||
" 2 2030 8548487400 0.87 72809988 33 2.62 57\n",
|
||
" 3 2035 8887524213 0.78 67807363 34 2.70 60\n",
|
||
" 4 2040 9198847240 0.69 62264605 35 2.77 62\n",
|
||
" 5 2045 9481803274 0.61 56591207 35 2.85 64\n",
|
||
" 6 2050 9735033990 0.53 50646143 36 2.95 65,\n",
|
||
" Country Capital Continent\n",
|
||
" 0 Afghanistan Kabul Asia\n",
|
||
" 1 Albania Tirana Europe\n",
|
||
" 2 Algeria Algiers Africa\n",
|
||
" 3 American Samoa Pago Pago Oceania\n",
|
||
" 4 Andorra Andorra la Vella Europe\n",
|
||
" .. ... ... ...\n",
|
||
" 229 Wallis and Futuna Mata-Utu Oceania\n",
|
||
" 230 Western Sahara El Aai?�n Africa\n",
|
||
" 231 Yemen Sanaa Asia\n",
|
||
" 232 Zambia Lusaka Africa\n",
|
||
" 233 Zimbabwe Harare Africa\n",
|
||
" \n",
|
||
" [234 rows x 3 columns])"
|
||
]
|
||
},
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import featuretools as ft\n",
|
||
"from woodwork.logical_types import Categorical, Datetime\n",
|
||
"\n",
|
||
"info = pd.read_csv(\"data/world-population-by-country-2020.csv\")\n",
|
||
"forcast = pd.read_csv(\"data/world-population-forcast-2020-2050.csv\")\n",
|
||
"capitals = pd.read_csv(\"data/countries-continents-capitals.csv\", encoding=\"ISO-8859-1\")\n",
|
||
"forcast[\"Population\"] = forcast[\"Population\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"forcast[\"YearlyPer\"] = forcast[\"YearlyPer\"].apply(\n",
|
||
" lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
|
||
")\n",
|
||
"forcast[\"Yearly\"] = forcast[\"Yearly\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"info = info.drop([\"Migrants\", \"FertRate\", \"MedAge\", \"UrbanPop\", \"WorldShare\"], axis=1)\n",
|
||
"info[\"Population2020\"] = info[\"Population2020\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"info[\"Yearly\"] = info[\"Yearly\"].apply(\n",
|
||
" lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
|
||
")\n",
|
||
"info[\"NetChange\"] = info[\"NetChange\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"info[\"LandArea\"] = info[\"LandArea\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"\n",
|
||
"info, forcast, capitals"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Создание сущностей в featuretools\n",
|
||
"\n",
|
||
"Добавление dataframe'ов с данными в EntitySet с указанием параметров: название сущности (таблицы), первичный ключ, логические типы атрибутов (категориальные признаки и даты)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Entityset: countries\n",
|
||
" DataFrames:\n",
|
||
" countries [Rows: 235, Columns: 7]\n",
|
||
" capitals [Rows: 234, Columns: 3]\n",
|
||
" forcast [Rows: 7, Columns: 8]\n",
|
||
" Relationships:\n",
|
||
" No relationships"
|
||
]
|
||
},
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"es = ft.EntitySet(id=\"countries\")\n",
|
||
"\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"countries\",\n",
|
||
" dataframe=info,\n",
|
||
" index=\"no\",\n",
|
||
" logical_types={\n",
|
||
" \"Country\": Categorical,\n",
|
||
" },\n",
|
||
")\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"capitals\",\n",
|
||
" dataframe=capitals,\n",
|
||
" index=\"Country\",\n",
|
||
" logical_types={\n",
|
||
" \"Country\": Categorical,\n",
|
||
" \"Capital\": Categorical,\n",
|
||
" \"Continent\": Categorical,\n",
|
||
" },\n",
|
||
")\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"forcast\",\n",
|
||
" dataframe=forcast,\n",
|
||
" index=\"forcast_id\",\n",
|
||
" make_index=True,\n",
|
||
" logical_types={\n",
|
||
" \"Year\": Datetime,\n",
|
||
" },\n",
|
||
")\n",
|
||
"\n",
|
||
"es"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Настройка связей между сущностями featuretools\n",
|
||
"\n",
|
||
"Настройка связей между таблицами на уровне ключей\n",
|
||
"\n",
|
||
"Связь указывается от родителя к потомкам (таблица-родитель, первичный ключ, таблица-потомок, внешний ключ)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Entityset: countries\n",
|
||
" DataFrames:\n",
|
||
" countries [Rows: 235, Columns: 7]\n",
|
||
" capitals [Rows: 234, Columns: 3]\n",
|
||
" forcast [Rows: 7, Columns: 8]\n",
|
||
" Relationships:\n",
|
||
" countries.Country -> capitals.Country"
|
||
]
|
||
},
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"es = es.add_relationship(\"capitals\", \"Country\", \"countries\", \"Country\")\n",
|
||
"\n",
|
||
"es"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Автоматическое конструирование признаков с помощью featuretools\n",
|
||
"\n",
|
||
"Библиотека применяет различные функции агрегации и трансформации к атрибутам таблицы countries с учетом отношений\n",
|
||
"\n",
|
||
"Результат помещается в Dataframe feature_matrix"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>Yearly</th>\n",
|
||
" <th>NetChange</th>\n",
|
||
" <th>LandArea</th>\n",
|
||
" <th>capitals.Capital</th>\n",
|
||
" <th>capitals.Continent</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>0.39</td>\n",
|
||
" <td>5540090</td>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>Beijing</td>\n",
|
||
" <td>Asia</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>0.99</td>\n",
|
||
" <td>13586631</td>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>New Delhi</td>\n",
|
||
" <td>Asia</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>0.59</td>\n",
|
||
" <td>1937734</td>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>Washington, D.C.</td>\n",
|
||
" <td>North America</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>1.07</td>\n",
|
||
" <td>2898047</td>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>Jakarta</td>\n",
|
||
" <td>Asia</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>2.00</td>\n",
|
||
" <td>4327022</td>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>Islamabad</td>\n",
|
||
" <td>Asia</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>231</th>\n",
|
||
" <td>Montserrat</td>\n",
|
||
" <td>4992</td>\n",
|
||
" <td>0.06</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>Brades</td>\n",
|
||
" <td>North America</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>232</th>\n",
|
||
" <td>Falkland Islands</td>\n",
|
||
" <td>3480</td>\n",
|
||
" <td>3.05</td>\n",
|
||
" <td>103</td>\n",
|
||
" <td>12170</td>\n",
|
||
" <td>Stanley</td>\n",
|
||
" <td>South America</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>233</th>\n",
|
||
" <td>Niue</td>\n",
|
||
" <td>1626</td>\n",
|
||
" <td>0.68</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>260</td>\n",
|
||
" <td>Alofi</td>\n",
|
||
" <td>Oceania</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>234</th>\n",
|
||
" <td>Tokelau</td>\n",
|
||
" <td>1357</td>\n",
|
||
" <td>1.27</td>\n",
|
||
" <td>17</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>Nukunonu</td>\n",
|
||
" <td>Oceania</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>235</th>\n",
|
||
" <td>Holy See</td>\n",
|
||
" <td>801</td>\n",
|
||
" <td>0.25</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>235 rows × 7 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 Yearly NetChange LandArea \\\n",
|
||
"no \n",
|
||
"1 China 1439323776 0.39 5540090 9388211 \n",
|
||
"2 India 1380004385 0.99 13586631 2973190 \n",
|
||
"3 United States 331002651 0.59 1937734 9147420 \n",
|
||
"4 Indonesia 273523615 1.07 2898047 1811570 \n",
|
||
"5 Pakistan 220892340 2.00 4327022 770880 \n",
|
||
".. ... ... ... ... ... \n",
|
||
"231 Montserrat 4992 0.06 3 100 \n",
|
||
"232 Falkland Islands 3480 3.05 103 12170 \n",
|
||
"233 Niue 1626 0.68 11 260 \n",
|
||
"234 Tokelau 1357 1.27 17 10 \n",
|
||
"235 Holy See 801 0.25 2 0 \n",
|
||
"\n",
|
||
" capitals.Capital capitals.Continent \n",
|
||
"no \n",
|
||
"1 Beijing Asia \n",
|
||
"2 New Delhi Asia \n",
|
||
"3 Washington, D.C. North America \n",
|
||
"4 Jakarta Asia \n",
|
||
"5 Islamabad Asia \n",
|
||
".. ... ... \n",
|
||
"231 Brades North America \n",
|
||
"232 Stanley South America \n",
|
||
"233 Alofi Oceania \n",
|
||
"234 Nukunonu Oceania \n",
|
||
"235 NaN NaN \n",
|
||
"\n",
|
||
"[235 rows x 7 columns]"
|
||
]
|
||
},
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"feature_matrix, feature_defs = ft.dfs(\n",
|
||
" entityset=es,\n",
|
||
" target_dataframe_name=\"countries\",\n",
|
||
" max_depth=1,\n",
|
||
")\n",
|
||
"\n",
|
||
"feature_matrix"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Полученные признаки\n",
|
||
"\n",
|
||
"Список колонок полученного dataframe'а"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"[<Feature: Country>,\n",
|
||
" <Feature: Population2020>,\n",
|
||
" <Feature: Yearly>,\n",
|
||
" <Feature: NetChange>,\n",
|
||
" <Feature: LandArea>,\n",
|
||
" <Feature: capitals.Capital>,\n",
|
||
" <Feature: capitals.Continent>]"
|
||
]
|
||
},
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"feature_defs"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Отсечение значений признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Определение выбросов с помощью boxplot"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Axes: >"
|
||
]
|
||
},
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"countries.boxplot(column=\"Population2020\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Отсечение значений признака Население (Population2020), превышающих 50000000\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>PopulationClip</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Brazil</td>\n",
|
||
" <td>212559417</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>Nigeria</td>\n",
|
||
" <td>206139589</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Bangladesh</td>\n",
|
||
" <td>164689383</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Russia</td>\n",
|
||
" <td>145934462</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>Mexico</td>\n",
|
||
" <td>128932753</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>Japan</td>\n",
|
||
" <td>126476461</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>Ethiopia</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>Philippines</td>\n",
|
||
" <td>109581078</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>Egypt</td>\n",
|
||
" <td>102334404</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>Vietnam</td>\n",
|
||
" <td>97338579</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>DR Congo</td>\n",
|
||
" <td>89561403</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>Turkey</td>\n",
|
||
" <td>84339067</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>Iran</td>\n",
|
||
" <td>83992949</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>Germany</td>\n",
|
||
" <td>83783942</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>Thailand</td>\n",
|
||
" <td>69799978</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>21</th>\n",
|
||
" <td>United Kingdom</td>\n",
|
||
" <td>67886011</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>22</th>\n",
|
||
" <td>France</td>\n",
|
||
" <td>65273511</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>23</th>\n",
|
||
" <td>Italy</td>\n",
|
||
" <td>60461826</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>24</th>\n",
|
||
" <td>Tanzania</td>\n",
|
||
" <td>59734218</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25</th>\n",
|
||
" <td>South Africa</td>\n",
|
||
" <td>59308690</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>26</th>\n",
|
||
" <td>Myanmar</td>\n",
|
||
" <td>54409800</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>27</th>\n",
|
||
" <td>Kenya</td>\n",
|
||
" <td>53771296</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>28</th>\n",
|
||
" <td>South Korea</td>\n",
|
||
" <td>51269185</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>29</th>\n",
|
||
" <td>Colombia</td>\n",
|
||
" <td>50882891</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 PopulationClip\n",
|
||
"no \n",
|
||
"1 China 1439323776 50000000\n",
|
||
"2 India 1380004385 50000000\n",
|
||
"3 United States 331002651 50000000\n",
|
||
"4 Indonesia 273523615 50000000\n",
|
||
"5 Pakistan 220892340 50000000\n",
|
||
"6 Brazil 212559417 50000000\n",
|
||
"7 Nigeria 206139589 50000000\n",
|
||
"8 Bangladesh 164689383 50000000\n",
|
||
"9 Russia 145934462 50000000\n",
|
||
"10 Mexico 128932753 50000000\n",
|
||
"11 Japan 126476461 50000000\n",
|
||
"12 Ethiopia 114963588 50000000\n",
|
||
"13 Philippines 109581078 50000000\n",
|
||
"14 Egypt 102334404 50000000\n",
|
||
"15 Vietnam 97338579 50000000\n",
|
||
"16 DR Congo 89561403 50000000\n",
|
||
"17 Turkey 84339067 50000000\n",
|
||
"18 Iran 83992949 50000000\n",
|
||
"19 Germany 83783942 50000000\n",
|
||
"20 Thailand 69799978 50000000\n",
|
||
"21 United Kingdom 67886011 50000000\n",
|
||
"22 France 65273511 50000000\n",
|
||
"23 Italy 60461826 50000000\n",
|
||
"24 Tanzania 59734218 50000000\n",
|
||
"25 South Africa 59308690 50000000\n",
|
||
"26 Myanmar 54409800 50000000\n",
|
||
"27 Kenya 53771296 50000000\n",
|
||
"28 South Korea 51269185 50000000\n",
|
||
"29 Colombia 50882891 50000000"
|
||
]
|
||
},
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"countries_norm = countries.copy()\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationClip\"] = countries_norm[\"Population2020\"].clip(0, 50000000)\n",
|
||
"\n",
|
||
"countries_norm[countries_norm[\"Population2020\"] > 50000000][\n",
|
||
" [\"Country\", \"Population2020\", \"PopulationClip\"]\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Винсоризация признака Population2020 (численность населения)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"111195830.99999991\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>PopulationWinsorized</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Brazil</td>\n",
|
||
" <td>212559417</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>Nigeria</td>\n",
|
||
" <td>206139589</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Bangladesh</td>\n",
|
||
" <td>164689383</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Russia</td>\n",
|
||
" <td>145934462</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>Mexico</td>\n",
|
||
" <td>128932753</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>Japan</td>\n",
|
||
" <td>126476461</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>Ethiopia</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>Philippines</td>\n",
|
||
" <td>109581078</td>\n",
|
||
" <td>109581078</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>Egypt</td>\n",
|
||
" <td>102334404</td>\n",
|
||
" <td>102334404</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>Vietnam</td>\n",
|
||
" <td>97338579</td>\n",
|
||
" <td>97338579</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>DR Congo</td>\n",
|
||
" <td>89561403</td>\n",
|
||
" <td>89561403</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>Turkey</td>\n",
|
||
" <td>84339067</td>\n",
|
||
" <td>84339067</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>Iran</td>\n",
|
||
" <td>83992949</td>\n",
|
||
" <td>83992949</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>Germany</td>\n",
|
||
" <td>83783942</td>\n",
|
||
" <td>83783942</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>Thailand</td>\n",
|
||
" <td>69799978</td>\n",
|
||
" <td>69799978</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>21</th>\n",
|
||
" <td>United Kingdom</td>\n",
|
||
" <td>67886011</td>\n",
|
||
" <td>67886011</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>22</th>\n",
|
||
" <td>France</td>\n",
|
||
" <td>65273511</td>\n",
|
||
" <td>65273511</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>23</th>\n",
|
||
" <td>Italy</td>\n",
|
||
" <td>60461826</td>\n",
|
||
" <td>60461826</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>24</th>\n",
|
||
" <td>Tanzania</td>\n",
|
||
" <td>59734218</td>\n",
|
||
" <td>59734218</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25</th>\n",
|
||
" <td>South Africa</td>\n",
|
||
" <td>59308690</td>\n",
|
||
" <td>59308690</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>26</th>\n",
|
||
" <td>Myanmar</td>\n",
|
||
" <td>54409800</td>\n",
|
||
" <td>54409800</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>27</th>\n",
|
||
" <td>Kenya</td>\n",
|
||
" <td>53771296</td>\n",
|
||
" <td>53771296</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>28</th>\n",
|
||
" <td>South Korea</td>\n",
|
||
" <td>51269185</td>\n",
|
||
" <td>51269185</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>29</th>\n",
|
||
" <td>Colombia</td>\n",
|
||
" <td>50882891</td>\n",
|
||
" <td>50882891</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 PopulationWinsorized\n",
|
||
"no \n",
|
||
"1 China 1439323776 114963588\n",
|
||
"2 India 1380004385 114963588\n",
|
||
"3 United States 331002651 114963588\n",
|
||
"4 Indonesia 273523615 114963588\n",
|
||
"5 Pakistan 220892340 114963588\n",
|
||
"6 Brazil 212559417 114963588\n",
|
||
"7 Nigeria 206139589 114963588\n",
|
||
"8 Bangladesh 164689383 114963588\n",
|
||
"9 Russia 145934462 114963588\n",
|
||
"10 Mexico 128932753 114963588\n",
|
||
"11 Japan 126476461 114963588\n",
|
||
"12 Ethiopia 114963588 114963588\n",
|
||
"13 Philippines 109581078 109581078\n",
|
||
"14 Egypt 102334404 102334404\n",
|
||
"15 Vietnam 97338579 97338579\n",
|
||
"16 DR Congo 89561403 89561403\n",
|
||
"17 Turkey 84339067 84339067\n",
|
||
"18 Iran 83992949 83992949\n",
|
||
"19 Germany 83783942 83783942\n",
|
||
"20 Thailand 69799978 69799978\n",
|
||
"21 United Kingdom 67886011 67886011\n",
|
||
"22 France 65273511 65273511\n",
|
||
"23 Italy 60461826 60461826\n",
|
||
"24 Tanzania 59734218 59734218\n",
|
||
"25 South Africa 59308690 59308690\n",
|
||
"26 Myanmar 54409800 54409800\n",
|
||
"27 Kenya 53771296 53771296\n",
|
||
"28 South Korea 51269185 51269185\n",
|
||
"29 Colombia 50882891 50882891"
|
||
]
|
||
},
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from scipy.stats.mstats import winsorize\n",
|
||
"\n",
|
||
"print(countries_norm[\"Population2020\"].quantile(q=0.95))\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationWinsorized\"] = winsorize(\n",
|
||
" countries_norm[\"Population2020\"].fillna(countries_norm[\"Population2020\"].mean()),\n",
|
||
" (0, 0.05),\n",
|
||
" inplace=False,\n",
|
||
")\n",
|
||
"\n",
|
||
"countries_norm[countries_norm[\"Population2020\"] > 50000000][\n",
|
||
" [\"Country\", \"Population2020\", \"PopulationWinsorized\"]\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Нормализация значений"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>PopulationNorm</th>\n",
|
||
" <th>PopulationClipNorm</th>\n",
|
||
" <th>PopulationWinsorizedNorm</th>\n",
|
||
" <th>PopulationWinsorizedNorm2</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>1.000000e+00</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>9.587866e-01</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>2.299705e-01</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>1.900357e-01</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>1.534691e-01</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>231</th>\n",
|
||
" <td>Montserrat</td>\n",
|
||
" <td>4992</td>\n",
|
||
" <td>2.911786e-06</td>\n",
|
||
" <td>0.000084</td>\n",
|
||
" <td>0.000036</td>\n",
|
||
" <td>-0.999927</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>232</th>\n",
|
||
" <td>Falkland Islands</td>\n",
|
||
" <td>3480</td>\n",
|
||
" <td>1.861292e-06</td>\n",
|
||
" <td>0.000054</td>\n",
|
||
" <td>0.000023</td>\n",
|
||
" <td>-0.999953</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>233</th>\n",
|
||
" <td>Niue</td>\n",
|
||
" <td>1626</td>\n",
|
||
" <td>5.731862e-07</td>\n",
|
||
" <td>0.000017</td>\n",
|
||
" <td>0.000007</td>\n",
|
||
" <td>-0.999986</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>234</th>\n",
|
||
" <td>Tokelau</td>\n",
|
||
" <td>1357</td>\n",
|
||
" <td>3.862927e-07</td>\n",
|
||
" <td>0.000011</td>\n",
|
||
" <td>0.000005</td>\n",
|
||
" <td>-0.999990</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>235</th>\n",
|
||
" <td>Holy See</td>\n",
|
||
" <td>801</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>-1.000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>235 rows × 6 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 PopulationNorm PopulationClipNorm \\\n",
|
||
"no \n",
|
||
"1 China 1439323776 1.000000e+00 1.000000 \n",
|
||
"2 India 1380004385 9.587866e-01 1.000000 \n",
|
||
"3 United States 331002651 2.299705e-01 1.000000 \n",
|
||
"4 Indonesia 273523615 1.900357e-01 1.000000 \n",
|
||
"5 Pakistan 220892340 1.534691e-01 1.000000 \n",
|
||
".. ... ... ... ... \n",
|
||
"231 Montserrat 4992 2.911786e-06 0.000084 \n",
|
||
"232 Falkland Islands 3480 1.861292e-06 0.000054 \n",
|
||
"233 Niue 1626 5.731862e-07 0.000017 \n",
|
||
"234 Tokelau 1357 3.862927e-07 0.000011 \n",
|
||
"235 Holy See 801 0.000000e+00 0.000000 \n",
|
||
"\n",
|
||
" PopulationWinsorizedNorm PopulationWinsorizedNorm2 \n",
|
||
"no \n",
|
||
"1 1.000000 1.000000 \n",
|
||
"2 1.000000 1.000000 \n",
|
||
"3 1.000000 1.000000 \n",
|
||
"4 1.000000 1.000000 \n",
|
||
"5 1.000000 1.000000 \n",
|
||
".. ... ... \n",
|
||
"231 0.000036 -0.999927 \n",
|
||
"232 0.000023 -0.999953 \n",
|
||
"233 0.000007 -0.999986 \n",
|
||
"234 0.000005 -0.999990 \n",
|
||
"235 0.000000 -1.000000 \n",
|
||
"\n",
|
||
"[235 rows x 6 columns]"
|
||
]
|
||
},
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn import preprocessing\n",
|
||
"\n",
|
||
"min_max_scaler = preprocessing.MinMaxScaler()\n",
|
||
"\n",
|
||
"min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" countries_norm[\"Population2020\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationClipNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" countries_norm[\"PopulationClip\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationWinsorizedNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationWinsorizedNorm2\"] = min_max_scaler_2.fit_transform(\n",
|
||
" countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\n",
|
||
" [\n",
|
||
" \"Country\",\n",
|
||
" \"Population2020\",\n",
|
||
" \"PopulationNorm\",\n",
|
||
" \"PopulationClipNorm\",\n",
|
||
" \"PopulationWinsorizedNorm\",\n",
|
||
" \"PopulationWinsorizedNorm2\",\n",
|
||
" ]\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Стандартизация значений"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>Population2020</th>\n",
|
||
" <th>PopulationStand</th>\n",
|
||
" <th>PopulationClipStand</th>\n",
|
||
" <th>PopulationWinsorizedStand</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>10.427597</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>9.987702</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>2.208627</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>1.782380</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>1.392082</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>231</th>\n",
|
||
" <td>Montserrat</td>\n",
|
||
" <td>4992</td>\n",
|
||
" <td>-0.245950</td>\n",
|
||
" <td>-0.795071</td>\n",
|
||
" <td>-0.621969</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>232</th>\n",
|
||
" <td>Falkland Islands</td>\n",
|
||
" <td>3480</td>\n",
|
||
" <td>-0.245962</td>\n",
|
||
" <td>-0.795158</td>\n",
|
||
" <td>-0.622019</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>233</th>\n",
|
||
" <td>Niue</td>\n",
|
||
" <td>1626</td>\n",
|
||
" <td>-0.245975</td>\n",
|
||
" <td>-0.795265</td>\n",
|
||
" <td>-0.622080</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>234</th>\n",
|
||
" <td>Tokelau</td>\n",
|
||
" <td>1357</td>\n",
|
||
" <td>-0.245977</td>\n",
|
||
" <td>-0.795280</td>\n",
|
||
" <td>-0.622089</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>235</th>\n",
|
||
" <td>Holy See</td>\n",
|
||
" <td>801</td>\n",
|
||
" <td>-0.245982</td>\n",
|
||
" <td>-0.795312</td>\n",
|
||
" <td>-0.622107</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>235 rows × 5 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country Population2020 PopulationStand PopulationClipStand \\\n",
|
||
"no \n",
|
||
"1 China 1439323776 10.427597 2.073933 \n",
|
||
"2 India 1380004385 9.987702 2.073933 \n",
|
||
"3 United States 331002651 2.208627 2.073933 \n",
|
||
"4 Indonesia 273523615 1.782380 2.073933 \n",
|
||
"5 Pakistan 220892340 1.392082 2.073933 \n",
|
||
".. ... ... ... ... \n",
|
||
"231 Montserrat 4992 -0.245950 -0.795071 \n",
|
||
"232 Falkland Islands 3480 -0.245962 -0.795158 \n",
|
||
"233 Niue 1626 -0.245975 -0.795265 \n",
|
||
"234 Tokelau 1357 -0.245977 -0.795280 \n",
|
||
"235 Holy See 801 -0.245982 -0.795312 \n",
|
||
"\n",
|
||
" PopulationWinsorizedStand \n",
|
||
"no \n",
|
||
"1 3.171659 \n",
|
||
"2 3.171659 \n",
|
||
"3 3.171659 \n",
|
||
"4 3.171659 \n",
|
||
"5 3.171659 \n",
|
||
".. ... \n",
|
||
"231 -0.621969 \n",
|
||
"232 -0.622019 \n",
|
||
"233 -0.622080 \n",
|
||
"234 -0.622089 \n",
|
||
"235 -0.622107 \n",
|
||
"\n",
|
||
"[235 rows x 5 columns]"
|
||
]
|
||
},
|
||
"execution_count": 27,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn import preprocessing\n",
|
||
"\n",
|
||
"stndart_scaler = preprocessing.StandardScaler()\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationStand\"] = stndart_scaler.fit_transform(\n",
|
||
" countries_norm[\"Population2020\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationClipStand\"] = stndart_scaler.fit_transform(\n",
|
||
" countries_norm[\"PopulationClip\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationWinsorizedStand\"] = stndart_scaler.fit_transform(\n",
|
||
" countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\n",
|
||
" [\n",
|
||
" \"Country\",\n",
|
||
" \"Population2020\",\n",
|
||
" \"PopulationStand\",\n",
|
||
" \"PopulationClipStand\",\n",
|
||
" \"PopulationWinsorizedStand\",\n",
|
||
" ]\n",
|
||
"]"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.4"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|