1901 lines
78 KiB
Plaintext
1901 lines
78 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Унитарное кодирование\n",
|
||
"\n",
|
||
"Преобразование категориального признака в несколько бинарных признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Загрузка набора данных, преобразование данных в числовой формат."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"countries = pd.read_csv(\n",
|
||
" \"data/population.csv\", index_col=\"no\"\n",
|
||
")\n",
|
||
"#преобразуем данные в числовой формат, удаляем запятые\n",
|
||
"countries[\"Population 2020\"] = countries[\"Population 2020\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"countries[\"Net Change\"] = countries[\"Net Change\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"countries[\"Yearly Change\"] = countries[\"Yearly Change\"].apply(\n",
|
||
" lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
|
||
")\n",
|
||
"countries[\"Land Area (Km²)\"] = countries[\"Land Area (Km²)\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"countries"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Унитарное кодирование признаков Пол (Sex) и Порт посадки (Embarked)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
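"Кодирование (пример для набора данных Titanic: код в ячейке ниже закомментирован, так как в этой работе используется набор данных о населении стран)"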
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"import numpy as np\n",
|
||
"\n",
|
||
"# encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
|
||
"\n",
|
||
"# encoded_values = encoder.fit_transform(titanic[[\"Embarked\", \"Sex\"]])\n",
|
||
"\n",
|
||
"# encoded_columns = encoder.get_feature_names_out([\"Embarked\", \"Sex\"])\n",
|
||
"\n",
|
||
"# encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n",
|
||
"\n",
|
||
"# encoded_values_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Добавление признаков в исходный Dataframe"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 37,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# titanic = pd.concat([titanic, encoded_values_df], axis=1)\n",
|
||
"\n",
|
||
"# titanic"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Дискретизация признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Равномерное разделение данных на 3 группы. Первый вывод — границы интервалов по площади, второй — количество стран в каждой группе\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"labels = [\"Small\", \"Middle\", \"Big\"]\n",
|
||
"num_bins = 3"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"hist1, bins1 = np.histogram(\n",
|
||
" countries[\"Land Area (Km²)\"].fillna(countries[\"Land Area (Km²)\"].median()), bins=num_bins\n",
|
||
")\n",
|
||
"bins1, hist1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [\n",
|
||
" countries[\"Country (or dependency)\"],\n",
|
||
" countries[\"Land Area (Km²)\"],\n",
|
||
" pd.cut(countries[\"Land Area (Km²)\"], list(bins1)),\n",
|
||
" ],\n",
|
||
" axis=1,\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [\n",
|
||
" countries[\"Country (or dependency)\"],\n",
|
||
" countries[\"Land Area (Km²)\"],\n",
|
||
" pd.cut(countries[\"Land Area (Km²)\"], list(bins1), labels=labels),\n",
|
||
" ],\n",
|
||
" axis=1,\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Равномерное разделение данных на 3 группы с установкой собственной границы диапазона значений (от 0 до 12 000 000 км²)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"labels = [\"Small\", \"Middle\", \"Big\"]\n",
|
||
"bins2 = np.linspace(0, 12000000, 4)\n",
|
||
"\n",
|
||
"tmp_bins2 = np.digitize(\n",
|
||
" countries[\"Land Area (Km²)\"].fillna(countries[\"Land Area (Km²)\"].median()), bins2\n",
|
||
")\n",
|
||
"\n",
|
||
"hist2 = np.bincount(tmp_bins2 - 1)\n",
|
||
"\n",
|
||
"bins2, hist2"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [\n",
|
||
" countries[\"Country (or dependency)\"],\n",
|
||
" countries[\"Land Area (Km²)\"],\n",
|
||
" pd.cut(countries[\"Land Area (Km²)\"], list(bins2)),\n",
|
||
" ],\n",
|
||
" axis=1,\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [\n",
|
||
" countries[\"Country (or dependency)\"],\n",
|
||
" countries[\"Land Area (Km²)\"],\n",
|
||
" pd.cut(countries[\"Land Area (Km²)\"], list(bins2), labels=labels),\n",
|
||
" ],\n",
|
||
" axis=1,\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Разделение данных на 5 групп с установкой собственных интервалов (0 – 1 000, 1 000 – 100 000, 100 000 – 500 000, 500 000 – 3 000 000, более 3 000 000 км²)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"labels2 = [\"Dwarf\", \"Small\", \"Middle\", \"Big\", \"Giant\"]\n",
|
||
"hist3, bins3 = np.histogram(\n",
|
||
" countries[\"Land Area (Km²)\"].fillna(countries[\"Land Area (Km²)\"].median()),\n",
|
||
" bins=[0, 1000, 100000, 500000, 3000000, np.inf],\n",
|
||
")\n",
|
||
"\n",
|
||
"\n",
|
||
"bins3, hist3"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [\n",
|
||
" countries[\"Country (or dependency)\"],\n",
|
||
" countries[\"Land Area (Km²)\"],\n",
|
||
" pd.cut(countries[\"Land Area (Km²)\"], list(bins3)),\n",
|
||
" ],\n",
|
||
" axis=1,\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [\n",
|
||
" countries[\"Country (or dependency)\"],\n",
|
||
" countries[\"Land Area (Km²)\"],\n",
|
||
" pd.cut(countries[\"Land Area (Km²)\"], list(bins3), labels=labels2),\n",
|
||
" ],\n",
|
||
" axis=1,\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Квантильное разделение данных на 5 групп\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [\n",
|
||
" countries[\"Country (or dependency)\"],\n",
|
||
" countries[\"Land Area (Km²)\"],\n",
|
||
" pd.qcut(countries[\"Land Area (Km²)\"], q=5, labels=False),\n",
|
||
" ],\n",
|
||
" axis=1,\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [\n",
|
||
" countries[\"Country (or dependency)\"],\n",
|
||
" countries[\"Land Area (Km²)\"],\n",
|
||
" pd.qcut(countries[\"Land Area (Km²)\"], q=5, labels=labels2),\n",
|
||
" ],\n",
|
||
" axis=1,\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Пример конструирования признаков на основе существующих\n",
|
||
"\n",
|
||
"Title - обращение к пассажиру (Mr, Mrs, Miss)\n",
|
||
"\n",
|
||
"Is_married - замужняя ли женщина\n",
|
||
"\n",
|
||
"Cabin_type - палуба (тип каюты)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 50,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# titanic_cl = titanic.drop(\n",
|
||
"# [\"Embarked_Q\", \"Embarked_S\", \"Embarked_nan\", \"Sex_male\"], axis=1, errors=\"ignore\"\n",
|
||
"# )\n",
|
||
"# titanic_cl = titanic_cl.dropna()\n",
|
||
"\n",
|
||
"# titanic_cl[\"Title\"] = [\n",
|
||
"# i.split(\",\")[1].split(\".\")[0].strip() for i in titanic_cl[\"Name\"]\n",
|
||
"# ]\n",
|
||
"\n",
|
||
"# titanic_cl[\"Is_married\"] = [1 if i == \"Mrs\" else 0 for i in titanic_cl[\"Title\"]]\n",
|
||
"\n",
|
||
"# titanic_cl[\"Cabin_type\"] = [i[0] for i in titanic_cl[\"Cabin\"]]\n",
|
||
"\n",
|
||
"# titanic_cl"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Пример использования библиотеки Featuretools для автоматического конструирования (синтеза) признаков\n",
|
||
"\n",
|
||
"https://featuretools.alteryx.com/en/stable/getting_started/using_entitysets.html"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Загрузка данных\n",
|
||
"\n",
|
||
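"В этой работе загружаются файлы data/population.csv, data/forcast.csv и data/country.csv\n",
|
||
"\n",
|
||
"Примечание: ссылка ниже относится к набору данных \"Ecommerce Orders Data Set\" из Kaggle (100 первых заказов и связанные с ними объекты), который в коде этой работы не используется\n",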
|
||
"\n",
|
||
"https://www.kaggle.com/datasets/sangamsharmait/ecommerce-orders-data-analysis"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"( no Country (or dependency) Population 2020 Yearly Change Net Change \\\n",
|
||
" 0 1 China 1439323776 0.39 5540090 \n",
|
||
" 1 2 India 1380004385 0.99 13586631 \n",
|
||
" 2 3 United States 331002651 0.59 1937734 \n",
|
||
" 3 4 Indonesia 273523615 1.07 2898047 \n",
|
||
" 4 5 Pakistan 220892340 2.00 4327022 \n",
|
||
" .. ... ... ... ... ... \n",
|
||
" 230 231 Montserrat 4992 0.06 3 \n",
|
||
" 231 232 Falkland Islands 3480 3.05 103 \n",
|
||
" 232 233 Niue 1626 0.68 11 \n",
|
||
" 233 234 Tokelau 1357 1.27 17 \n",
|
||
" 234 235 Holy See 801 0.25 2 \n",
|
||
" \n",
|
||
" Density(P/Km²) Land Area (Km²) \n",
|
||
" 0 153 9388211 \n",
|
||
" 1 464 2973190 \n",
|
||
" 2 36 9147420 \n",
|
||
" 3 151 1811570 \n",
|
||
" 4 287 770880 \n",
|
||
" .. ... ... \n",
|
||
" 230 50 100 \n",
|
||
" 231 0 12170 \n",
|
||
" 232 6 260 \n",
|
||
" 233 136 10 \n",
|
||
" 234 2,003 0 \n",
|
||
" \n",
|
||
" [235 rows x 7 columns],\n",
|
||
" Year Population YearlyPer Yearly Median Fertility Density\n",
|
||
" 0 2020 7794798739 1.10 83000320 31 2.47 52\n",
|
||
" 1 2025 8184437460 0.98 77927744 32 2.54 55\n",
|
||
" 2 2030 8548487400 0.87 72809988 33 2.62 57\n",
|
||
" 3 2035 8887524213 0.78 67807363 34 2.70 60\n",
|
||
" 4 2040 9198847240 0.69 62264605 35 2.77 62\n",
|
||
" 5 2045 9481803274 0.61 56591207 35 2.85 64\n",
|
||
" 6 2050 9735033990 0.53 50646143 36 2.95 65,\n",
|
||
" Country/Territory Capital Continent\n",
|
||
" 0 Afghanistan Kabul Asia\n",
|
||
" 1 Albania Tirana Europe\n",
|
||
" 2 Algeria Algiers Africa\n",
|
||
" 3 American Samoa Pago Pago Oceania\n",
|
||
" 4 Andorra Andorra la Vella Europe\n",
|
||
" .. ... ... ...\n",
|
||
" 229 Wallis and Futuna Mata-Utu Oceania\n",
|
||
" 230 Western Sahara El Aain Africa\n",
|
||
" 231 Yemen Sanaa Asia\n",
|
||
" 232 Zambia Lusaka Africa\n",
|
||
" 233 Zimbabwe Harare Africa\n",
|
||
" \n",
|
||
" [234 rows x 3 columns])"
|
||
]
|
||
},
|
||
"execution_count": 32,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import featuretools as ft\n",
|
||
"from woodwork.logical_types import Categorical, Datetime\n",
|
||
"\n",
|
||
"info = pd.read_csv(\"data/population.csv\")\n",
|
||
"forcast = pd.read_csv(\"data/forcast.csv\")\n",
|
||
"capitals = pd.read_csv(\"data/country.csv\", encoding=\"ISO-8859-1\")\n",
|
||
"forcast[\"Population\"] = forcast[\"Population\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"forcast[\"YearlyPer\"] = forcast[\"YearlyPer\"].apply(\n",
|
||
" lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
|
||
")\n",
|
||
"forcast[\"Yearly\"] = forcast[\"Yearly\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"info = info.drop(\n",
|
||
" [\"Migrants (net)\", \"Fert. Rate\", \"MedAge\", \"Urban Pop %\", \"World Share\"], axis=1\n",
|
||
")\n",
|
||
"info[\"Population 2020\"] = info[\"Population 2020\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"info[\"Yearly Change\"] = info[\"Yearly Change\"].apply(\n",
|
||
" lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
|
||
")\n",
|
||
"info[\"Net Change\"] = info[\"Net Change\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"info[\"Land Area (Km²)\"] = info[\"Land Area (Km²)\"].apply(\n",
|
||
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
||
")\n",
|
||
"\n",
|
||
"info, forcast, capitals"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Создание сущностей в featuretools\n",
|
||
"\n",
|
||
"Добавление dataframe'ов с данными в EntitySet с указанием параметров: название сущности (таблицы), первичный ключ, категориальные атрибуты (в том числе даты)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 34,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"c:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Entityset: countries\n",
|
||
" DataFrames:\n",
|
||
" countries [Rows: 235, Columns: 7]\n",
|
||
" capitals [Rows: 234, Columns: 3]\n",
|
||
" forcast [Rows: 7, Columns: 8]\n",
|
||
" Relationships:\n",
|
||
" No relationships"
|
||
]
|
||
},
|
||
"execution_count": 34,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"es = ft.EntitySet(id=\"countries\")\n",
|
||
"\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"countries\",\n",
|
||
" dataframe=info,\n",
|
||
" index=\"no\",\n",
|
||
" logical_types={\n",
|
||
" \"Country (or dependency)\": Categorical,\n",
|
||
" },\n",
|
||
")\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"capitals\",\n",
|
||
" dataframe=capitals,\n",
|
||
" index=\"Country/Territory\",\n",
|
||
" logical_types={\n",
|
||
" \"Country/Territory\": Categorical,\n",
|
||
" \"Capital\": Categorical,\n",
|
||
" \"Continent\": Categorical,\n",
|
||
" },\n",
|
||
")\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"forcast\",\n",
|
||
" dataframe=forcast,\n",
|
||
" index=\"forcast_id\",\n",
|
||
" make_index=True,\n",
|
||
" logical_types={\n",
|
||
" \"Year\": Datetime,\n",
|
||
" },\n",
|
||
")\n",
|
||
"\n",
|
||
"es"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Настройка связей между сущностями featuretools\n",
|
||
"\n",
|
||
"Настройка связей между таблицами на уровне ключей\n",
|
||
"\n",
|
||
"Связь указывается от родителя к потомкам (таблица-родитель, первичный ключ, таблица-потомок, внешний ключ)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Entityset: countries\n",
|
||
" DataFrames:\n",
|
||
" countries [Rows: 235, Columns: 7]\n",
|
||
" capitals [Rows: 234, Columns: 3]\n",
|
||
" forcast [Rows: 7, Columns: 8]\n",
|
||
" Relationships:\n",
|
||
" countries.Country (or dependency) -> capitals.Country/Territory"
|
||
]
|
||
},
|
||
"execution_count": 35,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"es = es.add_relationship(\n",
|
||
" \"capitals\", \"Country/Territory\", \"countries\", \"Country (or dependency)\"\n",
|
||
")\n",
|
||
"\n",
|
||
"es"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Автоматическое конструирование признаков с помощью featuretools\n",
|
||
"\n",
|
||
"Библиотека применяет различные функции агрегации и трансформации к атрибутам таблицы countries с учетом отношений\n",
|
||
"\n",
|
||
"Результат помещается в Dataframe feature_matrix"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 36,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country (or dependency)</th>\n",
|
||
" <th>Population 2020</th>\n",
|
||
" <th>Yearly Change</th>\n",
|
||
" <th>Net Change</th>\n",
|
||
" <th>Land Area (Km²)</th>\n",
|
||
" <th>capitals.Capital</th>\n",
|
||
" <th>capitals.Continent</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>0.39</td>\n",
|
||
" <td>5540090</td>\n",
|
||
" <td>9388211</td>\n",
|
||
" <td>Beijing</td>\n",
|
||
" <td>Asia</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>0.99</td>\n",
|
||
" <td>13586631</td>\n",
|
||
" <td>2973190</td>\n",
|
||
" <td>New Delhi</td>\n",
|
||
" <td>Asia</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>0.59</td>\n",
|
||
" <td>1937734</td>\n",
|
||
" <td>9147420</td>\n",
|
||
" <td>Washington, D.C.</td>\n",
|
||
" <td>North America</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>1.07</td>\n",
|
||
" <td>2898047</td>\n",
|
||
" <td>1811570</td>\n",
|
||
" <td>Jakarta</td>\n",
|
||
" <td>Asia</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>2.00</td>\n",
|
||
" <td>4327022</td>\n",
|
||
" <td>770880</td>\n",
|
||
" <td>Islamabad</td>\n",
|
||
" <td>Asia</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>231</th>\n",
|
||
" <td>Montserrat</td>\n",
|
||
" <td>4992</td>\n",
|
||
" <td>0.06</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>Brades</td>\n",
|
||
" <td>North America</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>232</th>\n",
|
||
" <td>Falkland Islands</td>\n",
|
||
" <td>3480</td>\n",
|
||
" <td>3.05</td>\n",
|
||
" <td>103</td>\n",
|
||
" <td>12170</td>\n",
|
||
" <td>Stanley</td>\n",
|
||
" <td>South America</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>233</th>\n",
|
||
" <td>Niue</td>\n",
|
||
" <td>1626</td>\n",
|
||
" <td>0.68</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>260</td>\n",
|
||
" <td>Alofi</td>\n",
|
||
" <td>Oceania</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>234</th>\n",
|
||
" <td>Tokelau</td>\n",
|
||
" <td>1357</td>\n",
|
||
" <td>1.27</td>\n",
|
||
" <td>17</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>Nukunonu</td>\n",
|
||
" <td>Oceania</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>235</th>\n",
|
||
" <td>Holy See</td>\n",
|
||
" <td>801</td>\n",
|
||
" <td>0.25</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>235 rows × 7 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country (or dependency) Population 2020 Yearly Change Net Change \\\n",
|
||
"no \n",
|
||
"1 China 1439323776 0.39 5540090 \n",
|
||
"2 India 1380004385 0.99 13586631 \n",
|
||
"3 United States 331002651 0.59 1937734 \n",
|
||
"4 Indonesia 273523615 1.07 2898047 \n",
|
||
"5 Pakistan 220892340 2.00 4327022 \n",
|
||
".. ... ... ... ... \n",
|
||
"231 Montserrat 4992 0.06 3 \n",
|
||
"232 Falkland Islands 3480 3.05 103 \n",
|
||
"233 Niue 1626 0.68 11 \n",
|
||
"234 Tokelau 1357 1.27 17 \n",
|
||
"235 Holy See 801 0.25 2 \n",
|
||
"\n",
|
||
" Land Area (Km²) capitals.Capital capitals.Continent \n",
|
||
"no \n",
|
||
"1 9388211 Beijing Asia \n",
|
||
"2 2973190 New Delhi Asia \n",
|
||
"3 9147420 Washington, D.C. North America \n",
|
||
"4 1811570 Jakarta Asia \n",
|
||
"5 770880 Islamabad Asia \n",
|
||
".. ... ... ... \n",
|
||
"231 100 Brades North America \n",
|
||
"232 12170 Stanley South America \n",
|
||
"233 260 Alofi Oceania \n",
|
||
"234 10 Nukunonu Oceania \n",
|
||
"235 0 NaN NaN \n",
|
||
"\n",
|
||
"[235 rows x 7 columns]"
|
||
]
|
||
},
|
||
"execution_count": 36,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"feature_matrix, feature_defs = ft.dfs(\n",
|
||
" entityset=es,\n",
|
||
" target_dataframe_name=\"countries\",\n",
|
||
" max_depth=1,\n",
|
||
")\n",
|
||
"\n",
|
||
"feature_matrix"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Полученные признаки\n",
|
||
"\n",
|
||
"Список колонок полученного dataframe'а"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 37,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"[<Feature: Country (or dependency)>,\n",
|
||
" <Feature: Population 2020>,\n",
|
||
" <Feature: Yearly Change>,\n",
|
||
" <Feature: Net Change>,\n",
|
||
" <Feature: Land Area (Km²)>,\n",
|
||
" <Feature: capitals.Capital>,\n",
|
||
" <Feature: capitals.Continent>]"
|
||
]
|
||
},
|
||
"execution_count": 37,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"feature_defs"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Отсечение значений признаков"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Определение выбросов с помощью boxplot"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 38,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Axes: >"
|
||
]
|
||
},
|
||
"execution_count": 38,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAiMAAAGsCAYAAAAPJKchAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAvbklEQVR4nO3df1RVdb7/8dfhAEdIwQzFH2FalNqVlCwRG1NSIC0m8jY52VXzTs630lkqeSv6oVmT9AMdrWz81oyWt/ydWqOkcimVEvMrRuWkpqkxmaDW0qNgcDzs7x8uzvUEKgfxfOTwfKzlWuzP/uy936e19jmv9v7s/bFZlmUJAADAkCDTBQAAgKaNMAIAAIwijAAAAKMIIwAAwCjCCAAAMIowAgAAjCKMAAAAowgjAADAKMIIAAAwijACAACMalRhZOPGjUpLS1P79u1ls9m0cuVKn/exZMkS9ezZU+Hh4brqqqv0yiuvNHyhAACgzhpVGCkrK1OPHj00e/bsem3/0Ucf6f7779dDDz2k7du364033tBf/vIXvf766w1cKQAAqCtbY50oz2azacWKFUpPT/e0VVRU6KmnntLChQt19OhRde/eXS+99JIGDBggSRo+fLhcLpeWLl3q2ea1117Tyy+/rOLiYtlsNj9/CgAA0KiujJzPuHHjVFBQoEWLFumrr77S7373O91+++3avXu3pNNhpVmzZl7bhIWF6YcfftD3339vomQAAJq8gAkjxcXFmjdvnpYuXap+/frpmmuu0aRJk/Sb3/xG8+bNkySlpqZq+fLlysvLU1VVlb799ltNnz5dknTw4EGT5QMA0GQFmy6goXz99ddyu9267rrrvNorKip0xRVXSJLGjBmj7777TnfeeadcLpciIiI0fvx4PfvsswoKCphcBgBAoxIwYeTEiROy2+0qLCyU3W73Wte8eXNJp8eZvPTSS5o2bZpKSkrUunVr5eXlSZKuvvpqv9cMAAACKIzEx8fL7Xbr0KFD6tev3zn72u12dejQQZK0cOFCJSYmqnXr1v4oEwAA/EqjCiMnTpzQnj17PMv79u1TUVGRWrVqpeuuu07333+/Ro4cqenTpys+Pl6HDx9WXl6ebrjhBt1xxx06cuSIli1bpgEDBuiXX37xjDHZsGGDwU8FAEDT1qge7V2/fr2SkpJqtI8aNUpvv/22XC6X/vznP2v+/Pk6cOCAoqKi1KdPH02dOlVxcXE6cuSI0tLS9PXXX8uyLCUmJuqFF15QQkKCgU8DAACkRhZGAABA4OEREgAAYBRhBAAAGNUoBrBWVVXpxx9/VIsWLXhlOwAAjYRlWTp+/Ljat29/zvd5NYow8uOPPyomJsZ0GQAAoB7+9a9/6corrzzr+kYRRlq0aCHp9IeJiIgwXA2AhuRyubRu3TqlpKQoJCTEdDkAGpDT6VRMTIznd/xsGkUYqb41ExERQRgBAozL5VJ4eLgiIiIII0CAOt8QCwawAgAAowgjAADAKMIIAAAwijACAACMIowAAACjCCMAAMAowggAADCKMAIAAIwijAAwxu12a8OGDdq4caM2bNggt9ttuiQABhBGABixfPlyxcbGKjk5WTNmzFBycrJiY2O1fPly06UB8DPCCAC/W758ue655x7FxcUpPz9fCxcuVH5+vuLi4nTPPfcQSIAmxmZZlmW6iPNxOp2KjIzUsWPHmJsGaOTcbrdiY2MVFxenlStXyu12KycnR0OGDJHdbld6erq2b9+u3bt3y263my4XwAWo6+83V0YA+FV+fr7279+vJ598UkFB3l9BQUFByszM1L59+5Sfn2+oQgD+RhgB4FcHDx6UJHXv3r3W9dXt1f0ABD7CCAC/ateunSRp+/btta6vbq/uByDwEUYA+FW/fv3UqVMnTZs2TVVVVV7rqqqqlJWVpc6dO6tfv36GKgTgb4QRAH5lt9s1ffp0rVq1Sunp6dq8ebNOnjypzZs3Kz09XatWrVJ2dj
aDV4EmJNh0AQCanqFDh2rZsmV69NFHdeutt3raO3furGXLlmno0KEGqwPgbzzaC8AYt9utTz75RB999JEGDx6spKQkrogAAeSiPdq7ceNGpaWlqX379rLZbFq5cmWdt/3ss88UHBysnj17+npYAAHIbrerf//+uvXWW9W/f3+CCNBE+RxGysrK1KNHD82ePdun7Y4ePaqRI0dq4MCBvh4SAAAEMJ/HjAwePFiDBw/2+UAPPfSQhg8fLrvd7tPVFAAAENj8MoB13rx52rt3r9599139+c9/Pm//iooKVVRUeJadTqckyeVyyeVyXbQ6AfiX2+3W+vXrtXHjRjkcDg0YMIBbNUAAqetv9kUPI7t379YTTzyh/Px8BQfX7XBZWVmaOnVqjfZ169YpPDy8oUsEYEBBQYHmzZunQ4cOSZJmzJihNm3aaPTo0UpMTDRcHYCGUF5eXqd+FzWMuN1uDR8+XFOnTtV1111X5+0yMzOVkZHhWXY6nYqJiVFKSgpP0wABYMWKFXr55Zc1ZMgQTZo0SSUlJWrbtq2ys7P18ssva9GiRbr77rtNlwngAlXf2TifC3q012azacWKFUpPT691/dGjR3X55Zd7XXatqqqSZVmy2+1at26dbrvttvMeh0d7gcDBrL1A01HX3++LemUkIiJCX3/9tVfbG2+8oY8//ljLli1T586dL+bhAVyCqmftXbhwoYKCguR2uz3rqmft7du3r/Lz8zVgwABzhQLwG5/DyIkTJ7Rnzx7P8r59+1RUVKRWrVqpY8eOyszM1IEDBzR//nwFBQXVmJmzTZs2atas2Vln7AQQ2Ji1F8Cv+fyeka1btyo+Pl7x8fGSpIyMDMXHx2vy5MmSTn+BFBcXN2yVAAIGs/YC+DVeBw/ArxgzAjQdF+118ABwIZi1F8CvMWsvAL9j1l4AZ+I2DQBjmLUXCGyXxKO9AHAu1bP2lpWVMWsv0IQxZgQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUT6HkY0bNyotLU3t27eXzWbTypUrz9l/+fLlSk5OVuvWrRUREaHExEStXbu2vvUCAIAA43MYKSsrU48ePTR79uw69d+4caOSk5OVk5OjwsJCJSUlKS0tTV988YXPxQIAgMAT7OsGgwcP1uDBg+vcf+bMmV7L06ZN0wcffKB//OMfio+P9/XwAAAgwPgcRi5UVVWVjh8/rlatWp21T0VFhSoqKjzLTqdTkuRyueRyuS56jQD8p/qc5twGAk9dz2u/h5Hs7GydOHFC995771n7ZGVlaerUqTXa161bp/Dw8ItZHgBDcnNzTZcAoIGVl5fXqZ/Nsiyrvgex2WxasWKF0tPT69R/wYIFGjNmjD744AMNGjTorP1quzISExOjI0eOKCIior7lArgEuVwu5ebmKjk5WSEhIabLAdCAnE6noqKidOzYsXP+fvvtysiiRYv04IMPaunSpecMIpLkcDjkcDhqtIeEhPBlBQQozm8g8NT1nPbLe0YWLlyo0aNHa+HChbrjjjv8cUgAANBI+Hxl5MSJE9qzZ49ned++fSoqKlKrVq3UsWNHZWZm6sCBA5o/f76k07dmRo0apVmzZikhIUElJSWSpLCwMEVGRjbQxwAAAI2Vz1dGtm7dqvj4eM9juRkZGYqPj9fkyZMlSQcPHlRxcbGn/5tvvqlTp05p7Nixateuneff+PHjG+gjAACAxsznKyMDBgzQuca8vv32217L69ev9/UQAACgCWFuGgAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAA
BGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFE+h5GNGzcqLS1N7du3l81m08qVK8+7zfr163XjjTfK4XAoNjZWb7/9dj1KBQAAgcjnMFJWVqYePXpo9uzZdeq/b98+3XHHHUpKSlJRUZEmTJigBx98UGvXrvW5WAAAEHiCfd1g8ODBGjx4cJ37z5kzR507d9b06dMlSd26ddOnn36qv/zlL0pNTfX18AAAIMD4HEZ8VVBQoEGDBnm1paamasKECWfdpqKiQhUVFZ5lp9MpSXK5XHK5XBelTgBmVJ/TnNtA4KnreX3Rw0hJSYmio6O92qKjo+V0OnXy5EmFhYXV2CYrK0tTp06t0b5u3TqFh4dftFoBmJObm2u6BAANrLy8vE79LnoYqY/MzExlZGR4lp1Op2JiYpSSkqKIiAiDlQFoaC6XS7m5uUpOTlZISIjpcgA0oOo7G+dz0cNI27ZtVVpa6tVWWlqqiIiIWq+KSJLD4ZDD4ajRHhISwpcVEKA4v4HAU9dz+qK/ZyQxMVF5eXlebbm5uUpMTLzYhwYAAI2Az2HkxIkTKioqUlFRkaTTj+4WFRWpuLhY0ulbLCNHjvT0f+ihh7R371499thj2rlzp9544w0tWbJEEydObJhPAAAAGjWfw8jWrVsVHx+v+Ph4SVJGRobi4+M1efJkSdLBgwc9wUSSOnfurNWrVys3N1c9evTQ9OnT9be//Y3HegEAgKR6jBkZMGCALMs66/ra3q46YMAAffHFF74eCgAANAHMTQMAAIwijAAAAKMIIwAAwCjCCAAAMIowAgAAjCKMAAAAowgjAADAKMIIAAAwijACAACMIowAAACjCCMAAMAowggAADCKMAIAAIwijAAAAKMIIwAAwCjCCAAAMIowAgAAjCKMAAAAowgjAADAKMIIAAAwijACAACMIowAAACjCCMAAMAowggAADCKMAIAAIwijAAAAKMIIwAAwCjCCAAAMIowAgAAjCKMAAAAowgjAADAKMIIAAAwijACAACMIowAAACjCCMAAMAowggAADCqXmFk9uzZ6tSpk5o1a6aEhARt2bLlnP1nzpypLl26KCwsTDExMZo4caJ++eWXehUMAAACi89hZPHixcrIyNCUKVO0bds29ejRQ6mpqTp06FCt/RcsWKAnnnhCU6ZM0Y4dO/T3v/9dixcv1pNPPnnBxQMAgMbP5zAyY8YMjRkzRqNHj9b111+vOXPmKDw8XHPnzq21/6ZNm3TLLbdo+PDh6tSpk1JSUnTfffed92oKAABoGoJ96VxZWanCwkJlZmZ62oKCgjRo0CAVFBTUuk3fvn317rvvasuWLerdu7f27t2rnJwcjRgx4qzHqaioUEVFhWfZ6XRKklwul1wuly8lA7jEVZ/TnNtA4Knree1TGDly5Ijcbreio6O92qOjo7Vz585atxk+fLiOHDmi3/zmN7IsS6dOndJDDz10zts0WVlZmjp1ao32devWKTw83JeSATQSubm5pksA0MDKy8vr1M+nMFIf69ev17Rp0/TGG28oISFBe/bs0fjx4/X888/rmWeeqXWbzMxMZWRkeJadTqdiYmKUkpKiiIiIi10yAD9yuVzKzc1VcnKyQkJCTJcDoAFV39k4H5/CSFRUlOx2u0pLS73aS0tL1bZt21q3eeaZZzRixAg9+OCDkqS4uDiVlZXpj3/8o5566ikFBdUctuJwOORwOGq0h4SE8GUFBCjObyDw1PWc9mkAa2hoqHr16qW8vDxPW1VVlfLy8pSYmFjrNuXl5TUCh91ulyRZluXL4QEAQADy+TZNRkaGRo
0apZtuukm9e/fWzJkzVVZWptGjR0uSRo4cqQ4dOigrK0uSlJaWphkzZig+Pt5zm+aZZ55RWlqaJ5QAAICmy+cwMmzYMB0+fFiTJ09WSUmJevbsqTVr1ngGtRYXF3tdCXn66adls9n09NNP68CBA2rdurXS0tL0wgsvNNynAAAAjZbNagT3SpxOpyIjI3Xs2DEGsAIBxuVyKScnR0OGDGHMCBBg6vr7zdw0AADAKMIIAAAwijACAACMIowAAACjCCMAAMAowggAADCKMAIAAIwijAAAAKMIIwAAwCjCCAAAMIowAgAAjCKMAAAAowgjAADAKMIIAAAwijACAACMIowAAACjCCMAAMAowggAADCKMAIAAIwijAAAAKMIIwAAwCjCCAAAMIowAgAAjCKMAAAAowgjAADAKMIIAAAwijACAACMIowAAACjCCMAAMAowggAADCKMAIAAIwijAAAAKMIIwAAwCjCCAAAMIowAgAAjKpXGJk9e7Y6deqkZs2aKSEhQVu2bDln/6NHj2rs2LFq166dHA6HrrvuOuXk5NSrYAAAEFiCfd1g8eLFysjI0Jw5c5SQkKCZM2cqNTVVu3btUps2bWr0r6ysVHJystq0aaNly5apQ4cO+v7779WyZcuGqB8AADRyPoeRGTNmaMyYMRo9erQkac6cOVq9erXmzp2rJ554okb/uXPn6ueff9amTZsUEhIiSerUqdOFVQ0AAAKGT2GksrJShYWFyszM9LQFBQVp0KBBKigoqHWbDz/8UImJiRo7dqw++OADtW7dWsOHD9fjjz8uu91e6zYVFRWqqKjwLDudTkmSy+WSy+XypWQAl7jqc5pzGwg8dT2vfQojR44ckdvtVnR0tFd7dHS0du7cWes2e/fu1ccff6z7779fOTk52rNnjx555BG5XC5NmTKl1m2ysrI0derUGu3r1q1TeHi4LyUDaCRyc3NNlwCggZWXl9epn8+3aXxVVVWlNm3a6M0335TdblevXr104MABvfLKK2cNI5mZmcrIyPAsO51OxcTEKCUlRRERERe7ZAB+5HK5lJubq+TkZM+tXACBofrOxvn4FEaioqJkt9tVWlrq1V5aWqq2bdvWuk27du0UEhLidUumW7duKikpUWVlpUJDQ2ts43A45HA4arSHhITwZQUEKM5vIPDU9Zz26dHe0NBQ9erVS3l5eZ62qqoq5eXlKTExsdZtbrnlFu3Zs0dVVVWetm+//Vbt2rWrNYgAAICmxef3jGRkZOitt97SO++8ox07dujhhx9WWVmZ5+makSNHeg1wffjhh/Xzzz9r/Pjx+vbbb7V69WpNmzZNY8eObbhPAQAAGi2fx4wMGzZMhw8f1uTJk1VSUqKePXtqzZo1nkGtxcXFCgr634wTExOjtWvXauLEibrhhhvUoUMHjR8/Xo8//njDfQoAANBo2SzLskwXcT5Op1ORkZE6duwYA1iBAONyuZSTk6MhQ4YwZgQIMHX9/WZuGgAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFH1CiOzZ89Wp06d1KxZMyUkJGjLli112m7RokWy2WxKT0+vz2EBAEAA8jmMLF68WBkZGZoyZYq2bdumHj16KDU1VYcOHTrndvv379ekSZPUr1+/ehcLAAACj89hZMaMGRozZoxGjx6t66+/XnPmzFF4eLjmzp171m3cbrfuv/9+TZ06VVdfffUFFQwAAAJLsC+dKysrVVhYqMzMTE9bUFCQBg0apIKCgrNu99xzz6
lNmzb6wx/+oPz8/PMep6KiQhUVFZ5lp9MpSXK5XHK5XL6UDOASV31Oc24Dgaeu57VPYeTIkSNyu92Kjo72ao+OjtbOnTtr3ebTTz/V3//+dxUVFdX5OFlZWZo6dWqN9nXr1ik8PNyXkgE0Erm5uaZLANDAysvL69TPpzDiq+PHj2vEiBF66623FBUVVeftMjMzlZGR4Vl2Op2KiYlRSkqKIiIiLkapAAxxuVzKzc1VcnKyQkJCTJcDoAFV39k4H5/CSFRUlOx2u0pLS73aS0tL1bZt2xr9v/vuO+3fv19paWmetqqqqtMHDg7Wrl27dM0119TYzuFwyOFw1GgPCQnhywoIUJzfQOCp6znt0wDW0NBQ9erVS3l5eZ62qqoq5eXlKTExsUb/rl276uuvv1ZRUZHn329/+1slJSWpqKhIMTExvhweAAAEIJ9v02RkZGjUqFG66aab1Lt3b82cOVNlZWUaPXq0JGnkyJHq0KGDsrKy1KxZM3Xv3t1r+5YtW0pSjXYAANA0+RxGhg0bpsOHD2vy5MkqKSlRz549tWbNGs+g1uLiYgUF8WJXAABQNzbLsizTRZyP0+lUZGSkjh07xgBWIMC4XC7l5ORoyJAhjBkBAkxdf7+5hAEAAIwijAAAAKMIIwAAwCjCCAAAMIowAsAYt9utDRs2aOPGjdqwYYPcbrfpkgAYQBgBYMTy5csVGxur5ORkzZgxQ8nJyYqNjdXy5ctNlwbAzwgjAPxu+fLluueeexQXF6f8/HwtXLhQ+fn5iouL0z333EMgAZoY3jMCwK/cbrdiY2MVFxenlStXyu12e94zYrfblZ6eru3bt2v37t2y2+2mywVwAXjPCIBLUn5+vvbv368nn3yyxtuag4KClJmZqX379ik/P99QhQD8jTACwK8OHjwo6ezzU1W3V/cDEPgIIwD8ql27dpKk7du317q+ur26H4DARxgB4Ff9+vVTp06dNG3aNFVVVXmtq6qqUlZWljp37qx+/foZqhCAvxFGAPiV3W7X9OnTtWrVKqWnp2vz5s06efKkNm/erPT0dK1atUrZ2dkMXgWakGDTBQBoeoYOHaply5bp0Ucf1a233upp79y5s5YtW6ahQ4carA6Av/FoLwBj3G63PvnkE3300UcaPHiwkpKSuCICBJC6/n5zZQSAMXa7Xf3791dZWZn69+9PEAGaKMaMAAAAowgjAADAKMIIAGOYtReARBgBYAiz9gKoRhgB4HfM2gvgTDzaC8CvmLUXaDqYtRfAJYlZewH8GmEEgF8xay+AXyOMAPArZu0F8GuEEQB+xay9AH6NMALAr5i1F8CvMTcNAL9j1l4AZ+LRXgDGMGsvENiYtRfAJY9ZewFIjBkBYBBz0wCQCCMADGFuGgDVCCMA/K56bpru3bvr1Vdf1bhx4/Tqq6+qe/fuzE0DNEEMYAXgV9Vz00RFRenw4cP6/vvvPeuuuuoqtW7dWj/99BNz0wABgAGsAC5J1XPT7N+/X2FhYV7rDh065Akn+fn5GjBggIEKAfgbt2kA+NWBAwc8fw8cOFD5+flauHCh8vPzNXDgwFr7AQhs9Qojs2fPVqdOndSsWTMlJCRoy5YtZ+371ltvqV+/frr88st1+eWXa9CgQefsDyCwlZaWSpJ69OihDz74QAkJCQoLC1NCQoI++OAD3XDDDV79AAQ+n8PI4sWLlZGRoSlTpmjbtm3q0aOHUlNTdejQoVr7r1+/Xvfdd58++eQTFRQUKCYmRikpKfxfD9BE/fTTT5JU4xZNtfDwcK9+AAKfz2FkxowZGjNmjEaPHq3rr79ec+bMUXh4uObOnVtr//fee0+PPPKIevbsqa5du+pvf/ubqqqqlJeXd8HFA2h8goJOf+1Uz0Xz67lpNm/e7NUPQODzaQBrZWWlCgsLlZmZ6WkLCgrSoEGDVFBQUKd9lJeXy+VyqVWrVmftU1FRoYqKCs+y0+mUJLlcLrlcLl9KBnCJqZ6Nt0uXLvrqq6+85q
bp1KmTunTpol27dqlfv36c70AjV9dz2KcwcuTIEbndbkVHR3u1R0dHa+fOnXXax+OPP6727dtr0KBBZ+2TlZWlqVOn1mhft26d5xIugMbJ7XYrMjJSu3btUq9evZSSkiKHw6GKigpt27ZNhYWFioyM1IkTJ5STk2O6XAAXoLy8vE79/Ppo74svvqhFixZp/fr1atas2Vn7ZWZmKiMjw7PsdDo9Y014zwjQ+L355pv6/e9/r2+++UaFhYWe9vDwcNlsNr355ptKS0szWCGAhlB9Z+N8fAojUVFRstvtNUa5l5aWqm3btufcNjs7Wy+++KL+53/+xzNa/mwcDoccDkeN9pCQEIWEhPhSMoBL0L333qvg4GA9+uij2r9/v6c9Ojpa2dnZGjp0qLniADSYuv5m+zRCLDQ0VL169fIafFo9GDUxMfGs27388st6/vnntWbNGt10002+HBJAgBo6dKh27dql7OxsDRkyRNnZ2dq5cydBBGiCfB6unpGRobfeekvvvPOOduzYoYcfflhlZWUaPXq0JGnkyJFeA1xfeuklPfPMM5o7d646deqkkpISlZSU6MSJEw33KQA0OsuXL1eXLl00adIk5eTkaNKkSerSpQvz0gBNkM9hZNiwYcrOztbkyZPVs2dPFRUVac2aNZ5BrcXFxTp48KCn/1//+ldVVlbqnnvuUbt27Tz/srOzG+5TAGhUqifKi4uL83oDa1xcHBPlAU0QE+UB8KvqifLi4uK0cuVKud1u5eTkaMiQIbLb7UpPT9f27duZKA8IAHX9/eatQgD8qnqivCeffFKWZWnDhg3auHGjNmzYIMuylJmZqX379ik/P990qQD8hDACwK+qb+N+9913io2NVXJysmbMmKHk5GTFxsZq7969Xv0ABD7CCAC/ateunSRpxIgRtY4ZGTFihFc/AIGPMSMA/KqyslKXXXaZrrjiCv3www+yLMszZsRms+nKK6/UTz/9pLKyMoWGhpouF8AFqOvvt1/fwAoAmzZt0qlTp1RaWqq7775bycnJ2r17t77//nvl5uZ6Xqq4adMmDRgwwGyxAPyCMALAr6rHgowfP16vv/66Vq1a5Vlnt9s1fvx4zZo1izEjQBPCmBEAflU9FmTWrFk1bsOEhoZq1qxZXv0ABD7CCAC/6tu3r4KCTn/1DBw40GsA68CBAyVJQUFB6tu3r8kyAfgRYQSAX+Xn56uqqkqSZFmWtm3bps8++0zbtm1T9Xj6qqoq3jMCNCGMGQHgV+vXr5d0eube999/X6tXr/ass9vtuvfee7VkyRKtX7/ec6UEQGDjyggAI5YsWVLrmJElS5YYqgiAKVwZAeBX/fr18/x92223KTU1Vd9++62uu+46rV271nOl5Mx+AAIbYQSAMXl5eV63aZo1a2awGgCmcJsGgF+dOTD1l19+8Vp35jIDWIGmgzACwK+qn6SRal4JOXP5zH4AAhthBIBftWzZUtLp4BEVFeW1LioqyhNIqvsBCHyMGQHgV0ePHpV0+pbMDz/84LXuzOXqfgACH1dGAACAUYQRAH7VvHlzz98dOnTwWnfllVfW2g9AYCOMAPCrjz/+2PP3kSNHvNYdPny41n4AAhthBIBfnTkWpKKiwmvdmcuMGQGaDsIIAL+68cYbG7QfgMaPMALAr+68884G7Qeg8SOMAPCrRYsWNWg/AI0fYQSAX+3bt69B+wFo/HjpGQC/OnnypOfv1q1b6/rrr9fhw4fVunVrffPNN54nas7sByCwEUYA+NWZT8zcdNNNyszM1IEDB9ShQwdlZWXpo48+qtEPQGAjjADwqzOveKxZs8YTPiQpKCio1n4AAhthBIDPysvLtXPnznptGxER4fnbsiyvdWfO1BsREaFt27b5vP+uXbsqPDy8XrUBMIMwAsBnO3fuVK9evS7qMb766qt6HaOwsJB3lACNDGEEgM+6du2qwsLCem1bWVmpvn371rgqciabzaZNmzYpNDS0XrUBaFwIIwB8Fh
4efkFXHyZNmqRXXnnlnOv79OlT7/0DaFwIIwD87uWXX5YkzZgxQ26329MeHBysiRMnetYDaBps1rmulV4inE6nIiMjdezYMa/BbwAat8rKSj3155f1f1d/rv9zR4JeePqxet2aAXBpquvvN1dGgCZm35EylVWcMl2GR5/fjtTSynj1+W2cvj18UtKl8UjvZY5gdY66zHQZQJNQrzAye/ZsvfLKKyopKVGPHj302muvqXfv3mftv3TpUj3zzDPav3+/rr32Wr300ksaMmRIvYsGUD/7jpQpKXu96TJq9eiyr02XUMMnkwYQSAA/8DmMLF68WBkZGZozZ44SEhI0c+ZMpaamateuXWrTpk2N/ps2bdJ9992nrKws3XnnnVqwYIHS09O1bds2de/evUE+BIC6qb4iMnNYT8W2aW64mtPKTlZo1foC3TkgUZeFOUyXI0nac+iEJiwuuqSuIAGBzOcwMmPGDI0ZM0ajR4+WJM2ZM0erV6/W3Llz9cQTT9ToP2vWLN1+++36r//6L0nS888/r9zcXL3++uuaM2fOBZYPwFe2YKfszQ4oqNmlEUbCgk+p/eU/KqxFiYKCL407x/ZmJ2QLdpouA2gyfDrzKysrVVhYqMzMTE9bUFCQBg0apIKCglq3KSgoUEZGhldbamqqVq5cedbjVFRUeM1L4XSe/lJwuVxyuVy+lAzgDMdPViik5ed6css006XU8MaaN0yX4CWk5UCdOpXCdw5wAep6/vgURo4cOSK3263o6Giv9ujo6LO+GrqkpKTW/iUlJWc9TlZWlqZOnVqjfd26dbzmGbgABaU2uY4m6NSJ6y9oP5VH/qWfVmU3UFUN64o7Jyk0KuaC92OdaqH/V/Cpvg9rgKKAJqq8vLxO/S6Na6K/kpmZ6XU1xel0KiYmRikpKTzaC1yAPmWVittxSFe3vkxhIfZ67+fkyXLtH5zUIDW5T7n19ddfKy4uTvbg+tdUrdM11yos7ML/p+Uyh12drmDwKnAhqu9snI9PYSQqKkp2u12lpaVe7aWlpWrbtm2t27Rt29an/pLkcDjkcNQcyBYSEqKQkBBfSgZwhuiWIbo/sXMD7OkKJXa98KsP0unLuC1UriFDBnB+AwGmrud00Pm7/K/Q0FD16tVLeXl5nraqqirl5eUpMTGx1m0SExO9+ktSbm7uWfsDAICmxefbNBkZGRo1apRuuukm9e7dWzNnzlRZWZnn6ZqRI0eqQ4cOysrKkiSNHz9e/fv31/Tp03XHHXdo0aJF2rp1q958882G/SQAAKBR8jmMDBs2TIcPH9bkyZNVUlKinj17as2aNZ5BqsXFxQoK+t8LLn379tWCBQv09NNP68knn9S1116rlStX8o4RAAAgiblpABjmcrmUk5OjIUOGMGYECDB1/f32acwIAABAQyOMAAAAowgjAADAKMIIAAAwijACAACMIowAAACjCCMAAMAowggAADCKMAIAAIzy+XXwJlS/JLauUxEDaDxcLpfKy8vldDp5AysQYKp/t8/3svdGEUaOHz8uSYqJaZgpywEAgP8cP35ckZGRZ13fKOamqaqq0o8//qgWLVrIZrOZLgdAA3I6nYqJidG//vUv5p4CAoxlWTp+/Ljat2/vNYnurzWKMAIgcDERJgAGsAIAAKMIIwAAwCjCCACjHA6HpkyZIofDYboUAIYwZgQAABjFlREAAGAUYQQAABhFGAEAAEYRRgDU2YABAzRhwoRLZj8AAgNhBGgEHnjgAdlsNtlsNoWGhio2NlbPPfecTp06Zbq0c1q/fr1sNpuOHj3q1b58+XI9//zzF/XYX375pe677z7FxMQoLCxM3bp106xZs2qt8cYbb5TD4VBsbKzefvttr/VZWVm6+eab1aJFC7Vp00bp6enatWuXV59ffvlFY8eO1RVXXKHmzZvr3//931VaWnoxPx4QUAgjQCNx++236+DBg9q9e7ceffRRPfvss3rllVdMl1UvrVq1UosWLS7qMQoLC9
WmTRu9++67+uc//6mnnnpKmZmZev311z199u3bpzvuuENJSUkqKirShAkT9OCDD2rt2rWePhs2bNDYsWO1efNm5ebmyuVyKSUlRWVlZZ4+EydO1D/+8Q8tXbpUGzZs0I8//qihQ4de1M8HBBQLwCVv1KhR1l133eXVlpycbPXp08eyLMv6+eefrREjRlgtW7a0wsLCrNtvv9369ttvPX3nzZtnRUZGWitWrLBiY2Mth8NhpaSkWMXFxec8xvjx463+/ft7lvv372+NHz/eszx//nyrV69eVvPmza3o6Gjrvvvus0pLSy3Lsqx9+/ZZkrz+jRo1qtb91LX+NWvWWF27drUuu+wyKzU11frxxx99+u/4yCOPWElJSZ7lxx57zPq3f/s3rz7Dhg2zUlNTz7qPQ4cOWZKsDRs2WJZlWUePHrVCQkKspUuXevrs2LHDkmQVFBT4VB/QVHFlBGikwsLCVFlZKen0bZytW7fqww8/VEFBgSzL0pAhQ+RyuTz9y8vL9cILL2j+/Pn67LPPdPToUf3+97+/oBpcLpeef/55ffnll1q5cqX279+vBx54QNLpWbbff/99SdKuXbt08ODBWm+T+FJ/dna2/vu//1sbN25UcXGxJk2a5FO9x44dU6tWrTzLBQUFGjRokFef1NRUFRQUnHMfkjz7KSwslMvl8tpP165d1bFjx3PuB8D/CjZdAADfWJalvLw8rV27Vn/605+0e/duffjhh/rss8/Ut29fSdJ7772nmJgYrVy5Ur/73e8knQ4Or7/+uhISEiRJ77zzjrp166YtW7aod+/e9arlP//zPz1/X3311Xr11Vd1880368SJE2revLnnB7tNmzZq2bJlrfvwpf45c+bommuukSSNGzdOzz33XJ1r3bRpkxYvXqzVq1d72kpKShQdHe3VLzo6Wk6nUydPnlRYWJjXuqqqKk2YMEG33HKLunfv7tlHaGhojc8XHR2tkpKSOtcHNGVcGQEaiVWrVql58+Zq1qyZBg8erGHDhunZZ5/Vjh07FBwc7AkZknTFFVeoS5cu2rFjh6ctODhYN998s2e5a9euatmypVcfXxUWFiotLU0dO3ZUixYt1L9/f0lScXFxnfdR1/rDw8M9QUSS2rVrp0OHDtXpGNu3b9ddd92lKVOmKCUlpc61/drYsWO1fft2LVq0qN77AFATV0aARiIpKUl//etfFRoaqvbt2ys4uGFP36CgIFm/mh3izNskv1ZWVqbU1FSlpqbqvffeU+vWrVVcXKzU1FTP7aOGFBIS4rVss9lq1Fubb775RgMHDtQf//hHPf30017r2rZtW+Opl9LSUkVERNS4KjJu3DitWrVKGzdu1JVXXum1j8rKSh09etTr6khpaanatm1b148HNGlcGQEaicsuu0yxsbHq2LGjVxDp1q2bTp06pc8//9zT9tNPP2nXrl26/vrrPW2nTp3S1q1bPcu7du3S0aNH1a1bN0lS69atdfDgQa9jFhUVnbWenTt36qefftKLL76ofv36qWvXrjWuVISGhkqS3G73WfdT1/rr45///KeSkpI0atQovfDCCzXWJyYmKi8vz6stNzdXiYmJnmXLsjRu3DitWLFCH3/8sTp37uzVv1evXgoJCfHaz65du1RcXOy1HwBnRxgBGrlrr71Wd911l8aMGaNPP/1UX375pf7jP/5DHTp00F133eXpFxISoj/96U/6/PPPVVhYqAceeEB9+vTxjBe57bbbtHXrVs2fP1+7d+/WlClTtH379rMet2PHjgoNDdVrr72mvXv36sMPP6zx7pCrrrpKNptNq1at0uHDh3XixIl61++r7du3KykpSSkpKcrIyFBJSYlKSkp0+PBhT5+HHnpIe/fu1WOPPaadO3fqjTfe0JIlSzRx4kRPn7Fjx+rdd9/VggUL1KJFC89+Tp48KUmKjIzUH/7wB2VkZOiTTz5RYWGhRo8ercTERPXp06fe9QNNislHeQDUTW2P3Z6p+tHYyM
hIKywszEpNTa310dj333/fuvrqqy2Hw2ENGjTI+v777732M3nyZCs6OtqKjIy0Jk6caI0bN+6cj/YuWLDA6tSpk+VwOKzExETrww8/tCRZX3zxhafPc889Z7Vt29ay2WznfbT3fPWfacWKFda5vsKmTJlS49FiSdZVV13l1e+TTz6xevbsaYWGhlpXX321NW/ePK/1te1Dkle/kydPWo888oh1+eWXW+Hh4dbdd99tHTx48Ky1AfBms6w63HQF0Ki9/fbbmjBhQo03oQLApYDbNAAAwCjCCAAAMIrbNAAAwCiujAAAAKMIIwAAwCjCCAAAMIowAgAAjCKMAAAAowgjAADAKMIIAAAwijACAACMIowAAACj/j9m/Wv3UvEjmQAAAABJRU5ErkJggg==",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"countries.boxplot(column=\"Population 2020\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Отсечение значений признака Население (Population 2020), превышающих 50 000 000"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 40,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country (or dependency)</th>\n",
|
||
" <th>Population 2020</th>\n",
|
||
" <th>Population Clip</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Brazil</td>\n",
|
||
" <td>212559417</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>Nigeria</td>\n",
|
||
" <td>206139589</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Bangladesh</td>\n",
|
||
" <td>164689383</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Russia</td>\n",
|
||
" <td>145934462</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>Mexico</td>\n",
|
||
" <td>128932753</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>Japan</td>\n",
|
||
" <td>126476461</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>Ethiopia</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>Philippines</td>\n",
|
||
" <td>109581078</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>Egypt</td>\n",
|
||
" <td>102334404</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>Vietnam</td>\n",
|
||
" <td>97338579</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>DR Congo</td>\n",
|
||
" <td>89561403</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>Turkey</td>\n",
|
||
" <td>84339067</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>Iran</td>\n",
|
||
" <td>83992949</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>Germany</td>\n",
|
||
" <td>83783942</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>Thailand</td>\n",
|
||
" <td>69799978</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>21</th>\n",
|
||
" <td>United Kingdom</td>\n",
|
||
" <td>67886011</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>22</th>\n",
|
||
" <td>France</td>\n",
|
||
" <td>65273511</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>23</th>\n",
|
||
" <td>Italy</td>\n",
|
||
" <td>60461826</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>24</th>\n",
|
||
" <td>Tanzania</td>\n",
|
||
" <td>59734218</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25</th>\n",
|
||
" <td>South Africa</td>\n",
|
||
" <td>59308690</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>26</th>\n",
|
||
" <td>Myanmar</td>\n",
|
||
" <td>54409800</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>27</th>\n",
|
||
" <td>Kenya</td>\n",
|
||
" <td>53771296</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>28</th>\n",
|
||
" <td>South Korea</td>\n",
|
||
" <td>51269185</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>29</th>\n",
|
||
" <td>Colombia</td>\n",
|
||
" <td>50882891</td>\n",
|
||
" <td>50000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country (or dependency) Population 2020 Population Clip\n",
|
||
"no \n",
|
||
"1 China 1439323776 50000000\n",
|
||
"2 India 1380004385 50000000\n",
|
||
"3 United States 331002651 50000000\n",
|
||
"4 Indonesia 273523615 50000000\n",
|
||
"5 Pakistan 220892340 50000000\n",
|
||
"6 Brazil 212559417 50000000\n",
|
||
"7 Nigeria 206139589 50000000\n",
|
||
"8 Bangladesh 164689383 50000000\n",
|
||
"9 Russia 145934462 50000000\n",
|
||
"10 Mexico 128932753 50000000\n",
|
||
"11 Japan 126476461 50000000\n",
|
||
"12 Ethiopia 114963588 50000000\n",
|
||
"13 Philippines 109581078 50000000\n",
|
||
"14 Egypt 102334404 50000000\n",
|
||
"15 Vietnam 97338579 50000000\n",
|
||
"16 DR Congo 89561403 50000000\n",
|
||
"17 Turkey 84339067 50000000\n",
|
||
"18 Iran 83992949 50000000\n",
|
||
"19 Germany 83783942 50000000\n",
|
||
"20 Thailand 69799978 50000000\n",
|
||
"21 United Kingdom 67886011 50000000\n",
|
||
"22 France 65273511 50000000\n",
|
||
"23 Italy 60461826 50000000\n",
|
||
"24 Tanzania 59734218 50000000\n",
|
||
"25 South Africa 59308690 50000000\n",
|
||
"26 Myanmar 54409800 50000000\n",
|
||
"27 Kenya 53771296 50000000\n",
|
||
"28 South Korea 51269185 50000000\n",
|
||
"29 Colombia 50882891 50000000"
|
||
]
|
||
},
|
||
"execution_count": 40,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"countries_norm = countries.copy()\n",
|
||
"\n",
|
||
"countries_norm[\"Population Clip\"] = countries_norm[\"Population 2020\"].clip(0, 50000000);\n",
|
||
"\n",
|
||
"countries_norm[countries_norm[\"Population 2020\"] > 50000000][\n",
|
||
" [\"Country (or dependency)\", \"Population 2020\", \"Population Clip\"]\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Винсоризация признака Население (Population 2020): верхние 5 % значений"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 41,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"111195830.99999991\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country (or dependency)</th>\n",
|
||
" <th>Population 2020</th>\n",
|
||
" <th>PopulationWinsorized</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Brazil</td>\n",
|
||
" <td>212559417</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>Nigeria</td>\n",
|
||
" <td>206139589</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Bangladesh</td>\n",
|
||
" <td>164689383</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Russia</td>\n",
|
||
" <td>145934462</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>Mexico</td>\n",
|
||
" <td>128932753</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>Japan</td>\n",
|
||
" <td>126476461</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>Ethiopia</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" <td>114963588</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>Philippines</td>\n",
|
||
" <td>109581078</td>\n",
|
||
" <td>109581078</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>Egypt</td>\n",
|
||
" <td>102334404</td>\n",
|
||
" <td>102334404</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>Vietnam</td>\n",
|
||
" <td>97338579</td>\n",
|
||
" <td>97338579</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>DR Congo</td>\n",
|
||
" <td>89561403</td>\n",
|
||
" <td>89561403</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>Turkey</td>\n",
|
||
" <td>84339067</td>\n",
|
||
" <td>84339067</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>Iran</td>\n",
|
||
" <td>83992949</td>\n",
|
||
" <td>83992949</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>Germany</td>\n",
|
||
" <td>83783942</td>\n",
|
||
" <td>83783942</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>Thailand</td>\n",
|
||
" <td>69799978</td>\n",
|
||
" <td>69799978</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>21</th>\n",
|
||
" <td>United Kingdom</td>\n",
|
||
" <td>67886011</td>\n",
|
||
" <td>67886011</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>22</th>\n",
|
||
" <td>France</td>\n",
|
||
" <td>65273511</td>\n",
|
||
" <td>65273511</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>23</th>\n",
|
||
" <td>Italy</td>\n",
|
||
" <td>60461826</td>\n",
|
||
" <td>60461826</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>24</th>\n",
|
||
" <td>Tanzania</td>\n",
|
||
" <td>59734218</td>\n",
|
||
" <td>59734218</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25</th>\n",
|
||
" <td>South Africa</td>\n",
|
||
" <td>59308690</td>\n",
|
||
" <td>59308690</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>26</th>\n",
|
||
" <td>Myanmar</td>\n",
|
||
" <td>54409800</td>\n",
|
||
" <td>54409800</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>27</th>\n",
|
||
" <td>Kenya</td>\n",
|
||
" <td>53771296</td>\n",
|
||
" <td>53771296</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>28</th>\n",
|
||
" <td>South Korea</td>\n",
|
||
" <td>51269185</td>\n",
|
||
" <td>51269185</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>29</th>\n",
|
||
" <td>Colombia</td>\n",
|
||
" <td>50882891</td>\n",
|
||
" <td>50882891</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country (or dependency) Population 2020 PopulationWinsorized\n",
|
||
"no \n",
|
||
"1 China 1439323776 114963588\n",
|
||
"2 India 1380004385 114963588\n",
|
||
"3 United States 331002651 114963588\n",
|
||
"4 Indonesia 273523615 114963588\n",
|
||
"5 Pakistan 220892340 114963588\n",
|
||
"6 Brazil 212559417 114963588\n",
|
||
"7 Nigeria 206139589 114963588\n",
|
||
"8 Bangladesh 164689383 114963588\n",
|
||
"9 Russia 145934462 114963588\n",
|
||
"10 Mexico 128932753 114963588\n",
|
||
"11 Japan 126476461 114963588\n",
|
||
"12 Ethiopia 114963588 114963588\n",
|
||
"13 Philippines 109581078 109581078\n",
|
||
"14 Egypt 102334404 102334404\n",
|
||
"15 Vietnam 97338579 97338579\n",
|
||
"16 DR Congo 89561403 89561403\n",
|
||
"17 Turkey 84339067 84339067\n",
|
||
"18 Iran 83992949 83992949\n",
|
||
"19 Germany 83783942 83783942\n",
|
||
"20 Thailand 69799978 69799978\n",
|
||
"21 United Kingdom 67886011 67886011\n",
|
||
"22 France 65273511 65273511\n",
|
||
"23 Italy 60461826 60461826\n",
|
||
"24 Tanzania 59734218 59734218\n",
|
||
"25 South Africa 59308690 59308690\n",
|
||
"26 Myanmar 54409800 54409800\n",
|
||
"27 Kenya 53771296 53771296\n",
|
||
"28 South Korea 51269185 51269185\n",
|
||
"29 Colombia 50882891 50882891"
|
||
]
|
||
},
|
||
"execution_count": 41,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from scipy.stats.mstats import winsorize\n",
|
||
"\n",
|
||
"print(countries_norm[\"Population 2020\"].quantile(q=0.95))\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationWinsorized\"] = winsorize(\n",
|
||
" countries_norm[\"Population 2020\"].fillna(countries_norm[\"Population 2020\"].mean()),\n",
|
||
" (0, 0.05),\n",
|
||
" inplace=False,\n",
|
||
")\n",
|
||
"\n",
|
||
"countries_norm[countries_norm[\"Population 2020\"] > 50000000][\n",
|
||
" [\"Country (or dependency)\", \"Population 2020\", \"PopulationWinsorized\"]\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Нормализация значений"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 43,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country (or dependency)</th>\n",
|
||
" <th>Population 2020</th>\n",
|
||
" <th>PopulationNorm</th>\n",
|
||
" <th>PopulationClipNorm</th>\n",
|
||
" <th>PopulationWinsorizedNorm</th>\n",
|
||
" <th>PopulationWinsorizedNorm2</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>1.000000e+00</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>9.587866e-01</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>2.299705e-01</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>1.900357e-01</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>1.534691e-01</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>231</th>\n",
|
||
" <td>Montserrat</td>\n",
|
||
" <td>4992</td>\n",
|
||
" <td>2.911786e-06</td>\n",
|
||
" <td>0.000084</td>\n",
|
||
" <td>0.000036</td>\n",
|
||
" <td>-0.999927</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>232</th>\n",
|
||
" <td>Falkland Islands</td>\n",
|
||
" <td>3480</td>\n",
|
||
" <td>1.861292e-06</td>\n",
|
||
" <td>0.000054</td>\n",
|
||
" <td>0.000023</td>\n",
|
||
" <td>-0.999953</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>233</th>\n",
|
||
" <td>Niue</td>\n",
|
||
" <td>1626</td>\n",
|
||
" <td>5.731862e-07</td>\n",
|
||
" <td>0.000017</td>\n",
|
||
" <td>0.000007</td>\n",
|
||
" <td>-0.999986</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>234</th>\n",
|
||
" <td>Tokelau</td>\n",
|
||
" <td>1357</td>\n",
|
||
" <td>3.862927e-07</td>\n",
|
||
" <td>0.000011</td>\n",
|
||
" <td>0.000005</td>\n",
|
||
" <td>-0.999990</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>235</th>\n",
|
||
" <td>Holy See</td>\n",
|
||
" <td>801</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>-1.000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>235 rows × 6 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country (or dependency) Population 2020 PopulationNorm \\\n",
|
||
"no \n",
|
||
"1 China 1439323776 1.000000e+00 \n",
|
||
"2 India 1380004385 9.587866e-01 \n",
|
||
"3 United States 331002651 2.299705e-01 \n",
|
||
"4 Indonesia 273523615 1.900357e-01 \n",
|
||
"5 Pakistan 220892340 1.534691e-01 \n",
|
||
".. ... ... ... \n",
|
||
"231 Montserrat 4992 2.911786e-06 \n",
|
||
"232 Falkland Islands 3480 1.861292e-06 \n",
|
||
"233 Niue 1626 5.731862e-07 \n",
|
||
"234 Tokelau 1357 3.862927e-07 \n",
|
||
"235 Holy See 801 0.000000e+00 \n",
|
||
"\n",
|
||
" PopulationClipNorm PopulationWinsorizedNorm PopulationWinsorizedNorm2 \n",
|
||
"no \n",
|
||
"1 1.000000 1.000000 1.000000 \n",
|
||
"2 1.000000 1.000000 1.000000 \n",
|
||
"3 1.000000 1.000000 1.000000 \n",
|
||
"4 1.000000 1.000000 1.000000 \n",
|
||
"5 1.000000 1.000000 1.000000 \n",
|
||
".. ... ... ... \n",
|
||
"231 0.000084 0.000036 -0.999927 \n",
|
||
"232 0.000054 0.000023 -0.999953 \n",
|
||
"233 0.000017 0.000007 -0.999986 \n",
|
||
"234 0.000011 0.000005 -0.999990 \n",
|
||
"235 0.000000 0.000000 -1.000000 \n",
|
||
"\n",
|
||
"[235 rows x 6 columns]"
|
||
]
|
||
},
|
||
"execution_count": 43,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn import preprocessing\n",
|
||
"\n",
|
||
"min_max_scaler = preprocessing.MinMaxScaler()\n",
|
||
"\n",
|
||
"min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" countries_norm[\"Population 2020\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population 2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationClipNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" countries_norm[\"Population Clip\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population 2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationWinsorizedNorm\"] = min_max_scaler.fit_transform(\n",
|
||
" countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population 2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\"PopulationWinsorizedNorm2\"] = min_max_scaler_2.fit_transform(\n",
|
||
" countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n",
|
||
").reshape(countries_norm[\"Population 2020\"].shape)\n",
|
||
"\n",
|
||
"countries_norm[\n",
|
||
" [\n",
|
||
" \"Country (or dependency)\",\n",
|
||
" \"Population 2020\",\n",
|
||
" \"PopulationNorm\",\n",
|
||
" \"PopulationClipNorm\",\n",
|
||
" \"PopulationWinsorizedNorm\",\n",
|
||
" \"PopulationWinsorizedNorm2\",\n",
|
||
" ]\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Стандартизация значений"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 44,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Country (or dependency)</th>\n",
|
||
" <th>Population 2020</th>\n",
|
||
" <th>PopulationStand</th>\n",
|
||
" <th>PopulationClipStand</th>\n",
|
||
" <th>PopulationWinsorizedStand</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>no</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>China</td>\n",
|
||
" <td>1439323776</td>\n",
|
||
" <td>10.427597</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>India</td>\n",
|
||
" <td>1380004385</td>\n",
|
||
" <td>9.987702</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>United States</td>\n",
|
||
" <td>331002651</td>\n",
|
||
" <td>2.208627</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Indonesia</td>\n",
|
||
" <td>273523615</td>\n",
|
||
" <td>1.782380</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Pakistan</td>\n",
|
||
" <td>220892340</td>\n",
|
||
" <td>1.392082</td>\n",
|
||
" <td>2.073933</td>\n",
|
||
" <td>3.171659</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>231</th>\n",
|
||
" <td>Montserrat</td>\n",
|
||
" <td>4992</td>\n",
|
||
" <td>-0.245950</td>\n",
|
||
" <td>-0.795071</td>\n",
|
||
" <td>-0.621969</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>232</th>\n",
|
||
" <td>Falkland Islands</td>\n",
|
||
" <td>3480</td>\n",
|
||
" <td>-0.245962</td>\n",
|
||
" <td>-0.795158</td>\n",
|
||
" <td>-0.622019</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>233</th>\n",
|
||
" <td>Niue</td>\n",
|
||
" <td>1626</td>\n",
|
||
" <td>-0.245975</td>\n",
|
||
" <td>-0.795265</td>\n",
|
||
" <td>-0.622080</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>234</th>\n",
|
||
" <td>Tokelau</td>\n",
|
||
" <td>1357</td>\n",
|
||
" <td>-0.245977</td>\n",
|
||
" <td>-0.795280</td>\n",
|
||
" <td>-0.622089</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>235</th>\n",
|
||
" <td>Holy See</td>\n",
|
||
" <td>801</td>\n",
|
||
" <td>-0.245982</td>\n",
|
||
" <td>-0.795312</td>\n",
|
||
" <td>-0.622107</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>235 rows × 5 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Country (or dependency) Population 2020 PopulationStand \\\n",
|
||
"no \n",
|
||
"1 China 1439323776 10.427597 \n",
|
||
"2 India 1380004385 9.987702 \n",
|
||
"3 United States 331002651 2.208627 \n",
|
||
"4 Indonesia 273523615 1.782380 \n",
|
||
"5 Pakistan 220892340 1.392082 \n",
|
||
".. ... ... ... \n",
|
||
"231 Montserrat 4992 -0.245950 \n",
|
||
"232 Falkland Islands 3480 -0.245962 \n",
|
||
"233 Niue 1626 -0.245975 \n",
|
||
"234 Tokelau 1357 -0.245977 \n",
|
||
"235 Holy See 801 -0.245982 \n",
|
||
"\n",
|
||
" PopulationClipStand PopulationWinsorizedStand \n",
|
||
"no \n",
|
||
"1 2.073933 3.171659 \n",
|
||
"2 2.073933 3.171659 \n",
|
||
"3 2.073933 3.171659 \n",
|
||
"4 2.073933 3.171659 \n",
|
||
"5 2.073933 3.171659 \n",
|
||
".. ... ... \n",
|
||
"231 -0.795071 -0.621969 \n",
|
||
"232 -0.795158 -0.622019 \n",
|
||
"233 -0.795265 -0.622080 \n",
|
||
"234 -0.795280 -0.622089 \n",
|
||
"235 -0.795312 -0.622107 \n",
|
||
"\n",
|
||
"[235 rows x 5 columns]"
|
||
]
|
||
},
|
||
"execution_count": 44,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn import preprocessing\n",
"\n",
"# A single StandardScaler is reused: fit_transform re-fits it on every column.\n",
"standard_scaler = preprocessing.StandardScaler()\n",
"\n",
"# Source column -> name of the standardized column derived from it.\n",
"standardized_columns = {\n",
"    \"Population 2020\": \"PopulationStand\",\n",
"    \"Population Clip\": \"PopulationClipStand\",\n",
"    \"PopulationWinsorized\": \"PopulationWinsorizedStand\",\n",
"}\n",
"\n",
"for source_column, target_column in standardized_columns.items():\n",
"    # The scaler is fed a 2-D (n_samples, 1) array; ravel() flattens the result\n",
"    # back to one value per row so it can be stored as a column.\n",
"    countries_norm[target_column] = standard_scaler.fit_transform(\n",
"        countries_norm[source_column].to_numpy().reshape(-1, 1)\n",
"    ).ravel()\n",
"\n",
"# Show the original population next to its three standardized variants.\n",
"countries_norm[\n",
"    [\"Country (or dependency)\", \"Population 2020\", *standardized_columns.values()]\n",
"]"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|