1901 lines
78 KiB
Plaintext
1901 lines
78 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Унитарное кодирование\n",
|
|||
|
"\n",
|
|||
|
"Преобразование категориального признака в несколько бинарных признаков"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Загрузка набора данных, преобразование данных в числовой формат."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"countries = pd.read_csv(\n",
|
|||
|
" \"data/population.csv\", index_col=\"no\"\n",
|
|||
|
")\n",
|
|||
|
"#преобразуем данные в числовой формат, удаляем запятые\n",
|
|||
|
"countries[\"Population 2020\"] = countries[\"Population 2020\"].apply(\n",
|
|||
|
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
|||
|
")\n",
|
|||
|
"countries[\"Net Change\"] = countries[\"Net Change\"].apply(\n",
|
|||
|
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
|||
|
")\n",
|
|||
|
"countries[\"Yearly Change\"] = countries[\"Yearly Change\"].apply(\n",
|
|||
|
" lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
|
|||
|
")\n",
|
|||
|
"countries[\"Land Area (Km²)\"] = countries[\"Land Area (Km²)\"].apply(\n",
|
|||
|
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
|||
|
")\n",
|
|||
|
"countries"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Унитарное кодирование признаков Пол (Sex) и Порт посадки (Embarked)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Кодирование"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 12,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"\n",
|
|||
|
"# encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
|
|||
|
"\n",
|
|||
|
"# encoded_values = encoder.fit_transform(titanic[[\"Embarked\", \"Sex\"]])\n",
|
|||
|
"\n",
|
|||
|
"# encoded_columns = encoder.get_feature_names_out([\"Embarked\", \"Sex\"])\n",
|
|||
|
"\n",
|
|||
|
"# encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n",
|
|||
|
"\n",
|
|||
|
"# encoded_values_df"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Добавление признаков в исходный Dataframe"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 37,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# titanic = pd.concat([titanic, encoded_values_df], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# titanic"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Дискретизация признаков"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Равномерное разделение данных на 3 группы. Первый вывод — границы интервалов по площади, второй — количество стран в каждой группе\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"labels = [\"Small\", \"Middle\", \"Big\"]\n",
|
|||
|
"num_bins = 3"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"hist1, bins1 = np.histogram(\n",
|
|||
|
" countries[\"Land Area (Km²)\"].fillna(countries[\"Land Area (Km²)\"].median()), bins=num_bins\n",
|
|||
|
")\n",
|
|||
|
"bins1, hist1"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"pd.concat(\n",
|
|||
|
" [\n",
|
|||
|
" countries[\"Country (or dependency)\"],\n",
|
|||
|
" countries[\"Land Area (Km²)\"],\n",
|
|||
|
" pd.cut(countries[\"Land Area (Km²)\"], list(bins1)),\n",
|
|||
|
" ],\n",
|
|||
|
" axis=1,\n",
|
|||
|
").head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"pd.concat(\n",
|
|||
|
" [\n",
|
|||
|
" countries[\"Country (or dependency)\"],\n",
|
|||
|
" countries[\"Land Area (Km²)\"],\n",
|
|||
|
" pd.cut(countries[\"Land Area (Km²)\"], list(bins1), labels=labels),\n",
|
|||
|
" ],\n",
|
|||
|
" axis=1,\n",
|
|||
|
").head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Равномерное разделение данных на 3 группы с установкой собственной границы диапазона значений (от 0 до 12 000 000 км²)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"labels = [\"Small\", \"Middle\", \"Big\"]\n",
|
|||
|
"bins2 = np.linspace(0, 12000000, 4)\n",
|
|||
|
"\n",
|
|||
|
"tmp_bins2 = np.digitize(\n",
|
|||
|
" countries[\"Land Area (Km²)\"].fillna(countries[\"Land Area (Km²)\"].median()), bins2\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"hist2 = np.bincount(tmp_bins2 - 1)\n",
|
|||
|
"\n",
|
|||
|
"bins2, hist2"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"pd.concat(\n",
|
|||
|
" [\n",
|
|||
|
" countries[\"Country (or dependency)\"],\n",
|
|||
|
" countries[\"Land Area (Km²)\"],\n",
|
|||
|
" pd.cut(countries[\"Land Area (Km²)\"], list(bins2)),\n",
|
|||
|
" ],\n",
|
|||
|
" axis=1,\n",
|
|||
|
").head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"pd.concat(\n",
|
|||
|
" [\n",
|
|||
|
" countries[\"Country (or dependency)\"],\n",
|
|||
|
" countries[\"Land Area (Km²)\"],\n",
|
|||
|
" pd.cut(countries[\"Land Area (Km²)\"], list(bins2), labels=labels),\n",
|
|||
|
" ],\n",
|
|||
|
" axis=1,\n",
|
|||
|
").head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разделение данных на 5 групп с установкой собственных интервалов (0 – 1 000, 1 000 – 100 000, 100 000 – 500 000, 500 000 – 3 000 000, более 3 000 000 км²)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"labels2 = [\"Dwarf\", \"Small\", \"Middle\", \"Big\", \"Giant\"]\n",
|
|||
|
"hist3, bins3 = np.histogram(\n",
|
|||
|
" countries[\"Land Area (Km²)\"].fillna(countries[\"Land Area (Km²)\"].median()),\n",
|
|||
|
" bins=[0, 1000, 100000, 500000, 3000000, np.inf],\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"bins3, hist3"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"pd.concat(\n",
|
|||
|
" [\n",
|
|||
|
" countries[\"Country (or dependency)\"],\n",
|
|||
|
" countries[\"Land Area (Km²)\"],\n",
|
|||
|
" pd.cut(countries[\"Land Area (Km²)\"], list(bins3)),\n",
|
|||
|
" ],\n",
|
|||
|
" axis=1,\n",
|
|||
|
").head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"pd.concat(\n",
|
|||
|
" [\n",
|
|||
|
" countries[\"Country (or dependency)\"],\n",
|
|||
|
" countries[\"Land Area (Km²)\"],\n",
|
|||
|
" pd.cut(countries[\"Land Area (Km²)\"], list(bins3), labels=labels2),\n",
|
|||
|
" ],\n",
|
|||
|
" axis=1,\n",
|
|||
|
").head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Квантильное разделение данных на 5 групп\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"pd.concat(\n",
|
|||
|
" [\n",
|
|||
|
" countries[\"Country (or dependency)\"],\n",
|
|||
|
" countries[\"Land Area (Km²)\"],\n",
|
|||
|
" pd.qcut(countries[\"Land Area (Km²)\"], q=5, labels=False),\n",
|
|||
|
" ],\n",
|
|||
|
" axis=1,\n",
|
|||
|
").head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"pd.concat(\n",
|
|||
|
" [\n",
|
|||
|
" countries[\"Country (or dependency)\"],\n",
|
|||
|
" countries[\"Land Area (Km²)\"],\n",
|
|||
|
" pd.qcut(countries[\"Land Area (Km²)\"], q=5, labels=labels2),\n",
|
|||
|
" ],\n",
|
|||
|
" axis=1,\n",
|
|||
|
").head(20)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Пример конструирования признаков на основе существующих\n",
|
|||
|
"\n",
|
|||
|
"Title - обращение к пассажиру (Mr, Mrs, Miss)\n",
|
|||
|
"\n",
|
|||
|
"Is_married - замужняя ли женщина\n",
|
|||
|
"\n",
|
|||
|
"Cabin_type - палуба (тип каюты)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 50,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# titanic_cl = titanic.drop(\n",
|
|||
|
"# [\"Embarked_Q\", \"Embarked_S\", \"Embarked_nan\", \"Sex_male\"], axis=1, errors=\"ignore\"\n",
|
|||
|
"# )\n",
|
|||
|
"# titanic_cl = titanic_cl.dropna()\n",
|
|||
|
"\n",
|
|||
|
"# titanic_cl[\"Title\"] = [\n",
|
|||
|
"# i.split(\",\")[1].split(\".\")[0].strip() for i in titanic_cl[\"Name\"]\n",
|
|||
|
"# ]\n",
|
|||
|
"\n",
|
|||
|
"# titanic_cl[\"Is_married\"] = [1 if i == \"Mrs\" else 0 for i in titanic_cl[\"Title\"]]\n",
|
|||
|
"\n",
|
|||
|
"# titanic_cl[\"Cabin_type\"] = [i[0] for i in titanic_cl[\"Cabin\"]]\n",
|
|||
|
"\n",
|
|||
|
"# titanic_cl"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Пример использования библиотеки Featuretools для автоматического конструирования (синтеза) признаков\n",
|
|||
|
"\n",
|
|||
|
"https://featuretools.alteryx.com/en/stable/getting_started/using_entitysets.html"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Загрузка данных\n",
|
|||
|
"\n",
|
|||
|
"За основу был взят набор данных \"Ecommerce Orders Data Set\" из Kaggle (примечание: код ниже загружает файлы data/population.csv, data/forcast.csv и data/country.csv)\n",
|
|||
|
"\n",
|
|||
|
"Используется только 100 первых заказов и связанные с ними объекты\n",
|
|||
|
"\n",
|
|||
|
"https://www.kaggle.com/datasets/sangamsharmait/ecommerce-orders-data-analysis"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 32,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"( no Country (or dependency) Population 2020 Yearly Change Net Change \\\n",
|
|||
|
" 0 1 China 1439323776 0.39 5540090 \n",
|
|||
|
" 1 2 India 1380004385 0.99 13586631 \n",
|
|||
|
" 2 3 United States 331002651 0.59 1937734 \n",
|
|||
|
" 3 4 Indonesia 273523615 1.07 2898047 \n",
|
|||
|
" 4 5 Pakistan 220892340 2.00 4327022 \n",
|
|||
|
" .. ... ... ... ... ... \n",
|
|||
|
" 230 231 Montserrat 4992 0.06 3 \n",
|
|||
|
" 231 232 Falkland Islands 3480 3.05 103 \n",
|
|||
|
" 232 233 Niue 1626 0.68 11 \n",
|
|||
|
" 233 234 Tokelau 1357 1.27 17 \n",
|
|||
|
" 234 235 Holy See 801 0.25 2 \n",
|
|||
|
" \n",
|
|||
|
" Density(P/Km²) Land Area (Km²) \n",
|
|||
|
" 0 153 9388211 \n",
|
|||
|
" 1 464 2973190 \n",
|
|||
|
" 2 36 9147420 \n",
|
|||
|
" 3 151 1811570 \n",
|
|||
|
" 4 287 770880 \n",
|
|||
|
" .. ... ... \n",
|
|||
|
" 230 50 100 \n",
|
|||
|
" 231 0 12170 \n",
|
|||
|
" 232 6 260 \n",
|
|||
|
" 233 136 10 \n",
|
|||
|
" 234 2,003 0 \n",
|
|||
|
" \n",
|
|||
|
" [235 rows x 7 columns],\n",
|
|||
|
" Year Population YearlyPer Yearly Median Fertility Density\n",
|
|||
|
" 0 2020 7794798739 1.10 83000320 31 2.47 52\n",
|
|||
|
" 1 2025 8184437460 0.98 77927744 32 2.54 55\n",
|
|||
|
" 2 2030 8548487400 0.87 72809988 33 2.62 57\n",
|
|||
|
" 3 2035 8887524213 0.78 67807363 34 2.70 60\n",
|
|||
|
" 4 2040 9198847240 0.69 62264605 35 2.77 62\n",
|
|||
|
" 5 2045 9481803274 0.61 56591207 35 2.85 64\n",
|
|||
|
" 6 2050 9735033990 0.53 50646143 36 2.95 65,\n",
|
|||
|
" Country/Territory Capital Continent\n",
|
|||
|
" 0 Afghanistan Kabul Asia\n",
|
|||
|
" 1 Albania Tirana Europe\n",
|
|||
|
" 2 Algeria Algiers Africa\n",
|
|||
|
" 3 American Samoa Pago Pago Oceania\n",
|
|||
|
" 4 Andorra Andorra la Vella Europe\n",
|
|||
|
" .. ... ... ...\n",
|
|||
|
" 229 Wallis and Futuna Mata-Utu Oceania\n",
|
|||
|
" 230 Western Sahara El Aain Africa\n",
|
|||
|
" 231 Yemen Sanaa Asia\n",
|
|||
|
" 232 Zambia Lusaka Africa\n",
|
|||
|
" 233 Zimbabwe Harare Africa\n",
|
|||
|
" \n",
|
|||
|
" [234 rows x 3 columns])"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 32,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import featuretools as ft\n",
|
|||
|
"from woodwork.logical_types import Categorical, Datetime\n",
|
|||
|
"\n",
|
|||
|
"info = pd.read_csv(\"data/population.csv\")\n",
|
|||
|
"forcast = pd.read_csv(\"data/forcast.csv\")\n",
|
|||
|
"capitals = pd.read_csv(\"data/country.csv\", encoding=\"ISO-8859-1\")\n",
|
|||
|
"forcast[\"Population\"] = forcast[\"Population\"].apply(\n",
|
|||
|
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
|||
|
")\n",
|
|||
|
"forcast[\"YearlyPer\"] = forcast[\"YearlyPer\"].apply(\n",
|
|||
|
" lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
|
|||
|
")\n",
|
|||
|
"forcast[\"Yearly\"] = forcast[\"Yearly\"].apply(\n",
|
|||
|
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
|||
|
")\n",
|
|||
|
"info = info.drop(\n",
|
|||
|
" [\"Migrants (net)\", \"Fert. Rate\", \"MedAge\", \"Urban Pop %\", \"World Share\"], axis=1\n",
|
|||
|
")\n",
|
|||
|
"info[\"Population 2020\"] = info[\"Population 2020\"].apply(\n",
|
|||
|
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
|||
|
")\n",
|
|||
|
"info[\"Yearly Change\"] = info[\"Yearly Change\"].apply(\n",
|
|||
|
" lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
|
|||
|
")\n",
|
|||
|
"info[\"Net Change\"] = info[\"Net Change\"].apply(\n",
|
|||
|
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
|||
|
")\n",
|
|||
|
"info[\"Land Area (Km²)\"] = info[\"Land Area (Km²)\"].apply(\n",
|
|||
|
" lambda x: int(\"\".join(x.split(\",\")))\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"info, forcast, capitals"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Создание сущностей в featuretools\n",
|
|||
|
"\n",
|
|||
|
"Добавление dataframe'ов с данными в EntitySet с указанием параметров: название сущности (таблицы), первичный ключ, категориальные атрибуты (в том числе даты)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 34,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"Entityset: countries\n",
|
|||
|
" DataFrames:\n",
|
|||
|
" countries [Rows: 235, Columns: 7]\n",
|
|||
|
" capitals [Rows: 234, Columns: 3]\n",
|
|||
|
" forcast [Rows: 7, Columns: 8]\n",
|
|||
|
" Relationships:\n",
|
|||
|
" No relationships"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 34,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"es = ft.EntitySet(id=\"countries\")\n",
|
|||
|
"\n",
|
|||
|
"es = es.add_dataframe(\n",
|
|||
|
" dataframe_name=\"countries\",\n",
|
|||
|
" dataframe=info,\n",
|
|||
|
" index=\"no\",\n",
|
|||
|
" logical_types={\n",
|
|||
|
" \"Country (or dependency)\": Categorical,\n",
|
|||
|
" },\n",
|
|||
|
")\n",
|
|||
|
"es = es.add_dataframe(\n",
|
|||
|
" dataframe_name=\"capitals\",\n",
|
|||
|
" dataframe=capitals,\n",
|
|||
|
" index=\"Country/Territory\",\n",
|
|||
|
" logical_types={\n",
|
|||
|
" \"Country/Territory\": Categorical,\n",
|
|||
|
" \"Capital\": Categorical,\n",
|
|||
|
" \"Continent\": Categorical,\n",
|
|||
|
" },\n",
|
|||
|
")\n",
|
|||
|
"es = es.add_dataframe(\n",
|
|||
|
" dataframe_name=\"forcast\",\n",
|
|||
|
" dataframe=forcast,\n",
|
|||
|
" index=\"forcast_id\",\n",
|
|||
|
" make_index=True,\n",
|
|||
|
" logical_types={\n",
|
|||
|
" \"Year\": Datetime,\n",
|
|||
|
" },\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"es"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Настройка связей между сущностями featuretools\n",
|
|||
|
"\n",
|
|||
|
"Настройка связей между таблицами на уровне ключей\n",
|
|||
|
"\n",
|
|||
|
"Связь указывается от родителя к потомкам (таблица-родитель, первичный ключ, таблица-потомок, внешний ключ)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 35,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"Entityset: countries\n",
|
|||
|
" DataFrames:\n",
|
|||
|
" countries [Rows: 235, Columns: 7]\n",
|
|||
|
" capitals [Rows: 234, Columns: 3]\n",
|
|||
|
" forcast [Rows: 7, Columns: 8]\n",
|
|||
|
" Relationships:\n",
|
|||
|
" countries.Country (or dependency) -> capitals.Country/Territory"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 35,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"es = es.add_relationship(\n",
|
|||
|
" \"capitals\", \"Country/Territory\", \"countries\", \"Country (or dependency)\"\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"es"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Автоматическое конструирование признаков с помощью featuretools\n",
|
|||
|
"\n",
|
|||
|
"Библиотека применяет различные функции агрегации и трансформации к атрибутам таблицы countries с учетом отношений\n",
|
|||
|
"\n",
|
|||
|
"Результат помещается в Dataframe feature_matrix"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 36,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Country (or dependency)</th>\n",
|
|||
|
" <th>Population 2020</th>\n",
|
|||
|
" <th>Yearly Change</th>\n",
|
|||
|
" <th>Net Change</th>\n",
|
|||
|
" <th>Land Area (Km²)</th>\n",
|
|||
|
" <th>capitals.Capital</th>\n",
|
|||
|
" <th>capitals.Continent</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>no</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>China</td>\n",
|
|||
|
" <td>1439323776</td>\n",
|
|||
|
" <td>0.39</td>\n",
|
|||
|
" <td>5540090</td>\n",
|
|||
|
" <td>9388211</td>\n",
|
|||
|
" <td>Beijing</td>\n",
|
|||
|
" <td>Asia</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>India</td>\n",
|
|||
|
" <td>1380004385</td>\n",
|
|||
|
" <td>0.99</td>\n",
|
|||
|
" <td>13586631</td>\n",
|
|||
|
" <td>2973190</td>\n",
|
|||
|
" <td>New Delhi</td>\n",
|
|||
|
" <td>Asia</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>United States</td>\n",
|
|||
|
" <td>331002651</td>\n",
|
|||
|
" <td>0.59</td>\n",
|
|||
|
" <td>1937734</td>\n",
|
|||
|
" <td>9147420</td>\n",
|
|||
|
" <td>Washington, D.C.</td>\n",
|
|||
|
" <td>North America</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>Indonesia</td>\n",
|
|||
|
" <td>273523615</td>\n",
|
|||
|
" <td>1.07</td>\n",
|
|||
|
" <td>2898047</td>\n",
|
|||
|
" <td>1811570</td>\n",
|
|||
|
" <td>Jakarta</td>\n",
|
|||
|
" <td>Asia</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>Pakistan</td>\n",
|
|||
|
" <td>220892340</td>\n",
|
|||
|
" <td>2.00</td>\n",
|
|||
|
" <td>4327022</td>\n",
|
|||
|
" <td>770880</td>\n",
|
|||
|
" <td>Islamabad</td>\n",
|
|||
|
" <td>Asia</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>231</th>\n",
|
|||
|
" <td>Montserrat</td>\n",
|
|||
|
" <td>4992</td>\n",
|
|||
|
" <td>0.06</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>100</td>\n",
|
|||
|
" <td>Brades</td>\n",
|
|||
|
" <td>North America</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>232</th>\n",
|
|||
|
" <td>Falkland Islands</td>\n",
|
|||
|
" <td>3480</td>\n",
|
|||
|
" <td>3.05</td>\n",
|
|||
|
" <td>103</td>\n",
|
|||
|
" <td>12170</td>\n",
|
|||
|
" <td>Stanley</td>\n",
|
|||
|
" <td>South America</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>233</th>\n",
|
|||
|
" <td>Niue</td>\n",
|
|||
|
" <td>1626</td>\n",
|
|||
|
" <td>0.68</td>\n",
|
|||
|
" <td>11</td>\n",
|
|||
|
" <td>260</td>\n",
|
|||
|
" <td>Alofi</td>\n",
|
|||
|
" <td>Oceania</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>234</th>\n",
|
|||
|
" <td>Tokelau</td>\n",
|
|||
|
" <td>1357</td>\n",
|
|||
|
" <td>1.27</td>\n",
|
|||
|
" <td>17</td>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" <td>Nukunonu</td>\n",
|
|||
|
" <td>Oceania</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>235</th>\n",
|
|||
|
" <td>Holy See</td>\n",
|
|||
|
" <td>801</td>\n",
|
|||
|
" <td>0.25</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>235 rows × 7 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Country (or dependency) Population 2020 Yearly Change Net Change \\\n",
|
|||
|
"no \n",
|
|||
|
"1 China 1439323776 0.39 5540090 \n",
|
|||
|
"2 India 1380004385 0.99 13586631 \n",
|
|||
|
"3 United States 331002651 0.59 1937734 \n",
|
|||
|
"4 Indonesia 273523615 1.07 2898047 \n",
|
|||
|
"5 Pakistan 220892340 2.00 4327022 \n",
|
|||
|
".. ... ... ... ... \n",
|
|||
|
"231 Montserrat 4992 0.06 3 \n",
|
|||
|
"232 Falkland Islands 3480 3.05 103 \n",
|
|||
|
"233 Niue 1626 0.68 11 \n",
|
|||
|
"234 Tokelau 1357 1.27 17 \n",
|
|||
|
"235 Holy See 801 0.25 2 \n",
|
|||
|
"\n",
|
|||
|
" Land Area (Km²) capitals.Capital capitals.Continent \n",
|
|||
|
"no \n",
|
|||
|
"1 9388211 Beijing Asia \n",
|
|||
|
"2 2973190 New Delhi Asia \n",
|
|||
|
"3 9147420 Washington, D.C. North America \n",
|
|||
|
"4 1811570 Jakarta Asia \n",
|
|||
|
"5 770880 Islamabad Asia \n",
|
|||
|
".. ... ... ... \n",
|
|||
|
"231 100 Brades North America \n",
|
|||
|
"232 12170 Stanley South America \n",
|
|||
|
"233 260 Alofi Oceania \n",
|
|||
|
"234 10 Nukunonu Oceania \n",
|
|||
|
"235 0 NaN NaN \n",
|
|||
|
"\n",
|
|||
|
"[235 rows x 7 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 36,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"feature_matrix, feature_defs = ft.dfs(\n",
|
|||
|
" entityset=es,\n",
|
|||
|
" target_dataframe_name=\"countries\",\n",
|
|||
|
" max_depth=1,\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"feature_matrix"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Полученные признаки\n",
|
|||
|
"\n",
|
|||
|
"Список колонок полученного dataframe'а"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 37,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"[<Feature: Country (or dependency)>,\n",
|
|||
|
" <Feature: Population 2020>,\n",
|
|||
|
" <Feature: Yearly Change>,\n",
|
|||
|
" <Feature: Net Change>,\n",
|
|||
|
" <Feature: Land Area (Km²)>,\n",
|
|||
|
" <Feature: capitals.Capital>,\n",
|
|||
|
" <Feature: capitals.Continent>]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 37,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"feature_defs"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Отсечение значений признаков"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Определение выбросов с помощью boxplot"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 38,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Axes: >"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 38,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAiMAAAGsCAYAAAAPJKchAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAvbklEQVR4nO3df1RVdb7/8dfhAEdIwQzFH2FalNqVlCwRG1NSIC0m8jY52VXzTs630lkqeSv6oVmT9AMdrWz81oyWt/ydWqOkcimVEvMrRuWkpqkxmaDW0qNgcDzs7x8uzvUEKgfxfOTwfKzlWuzP/uy936e19jmv9v7s/bFZlmUJAADAkCDTBQAAgKaNMAIAAIwijAAAAKMIIwAAwCjCCAAAMIowAgAAjCKMAAAAowgjAADAKMIIAAAwijACAACMalRhZOPGjUpLS1P79u1ls9m0cuVKn/exZMkS9ezZU+Hh4brqqqv0yiuvNHyhAACgzhpVGCkrK1OPHj00e/bsem3/0Ucf6f7779dDDz2k7du364033tBf/vIXvf766w1cKQAAqCtbY50oz2azacWKFUpPT/e0VVRU6KmnntLChQt19OhRde/eXS+99JIGDBggSRo+fLhcLpeWLl3q2ea1117Tyy+/rOLiYtlsNj9/CgAA0KiujJzPuHHjVFBQoEWLFumrr77S7373O91+++3avXu3pNNhpVmzZl7bhIWF6YcfftD3339vomQAAJq8gAkjxcXFmjdvnpYuXap+/frpmmuu0aRJk/Sb3/xG8+bNkySlpqZq+fLlysvLU1VVlb799ltNnz5dknTw4EGT5QMA0GQFmy6goXz99ddyu9267rrrvNorKip0xRVXSJLGjBmj7777TnfeeadcLpciIiI0fvx4PfvsswoKCphcBgBAoxIwYeTEiROy2+0qLCyU3W73Wte8eXNJp8eZvPTSS5o2bZpKSkrUunVr5eXlSZKuvvpqv9cMAAACKIzEx8fL7Xbr0KFD6tev3zn72u12dejQQZK0cOFCJSYmqnXr1v4oEwAA/EqjCiMnTpzQnj17PMv79u1TUVGRWrVqpeuuu07333+/Ro4cqenTpys+Pl6HDx9WXl6ebrjhBt1xxx06cuSIli1bpgEDBuiXX37xjDHZsGGDwU8FAEDT1qge7V2/fr2SkpJqtI8aNUpvv/22XC6X/vznP2v+/Pk6cOCAoqKi1KdPH02dOlVxcXE6cuSI0tLS9PXXX8uyLCUmJuqFF15QQkKCgU8DAACkRhZGAABA4OEREgAAYBRhBAAAGNUoBrBWVVXpxx9/VIsWLXhlOwAAjYRlWTp+/Ljat29/zvd5NYow8uOPPyomJsZ0GQAAoB7+9a9/6corrzzr+kYRRlq0aCHp9IeJiIgwXA2AhuRyubRu3TqlpKQoJCTEdDkAGpDT6VRMTIznd/xsGkUYqb41ExERQRgBAozL5VJ4eLgiIiIII0CAOt8QCwawAgAAowgjAADAKMIIAAAwijACAACMIowAAACjCCMAAMAowggAADCKMAIAAIwijAAwxu12a8OGDdq4caM2bNggt9ttuiQABhBGABixfPlyxcbGKjk5WTNmzFBycrJiY2O1fPly06UB8DPCCAC/W758ue655x7FxcUpPz9fCxcuVH5+vuLi4nTPPfcQSIAmxmZZlmW6iPNxOp2KjIzUsWPHmJsGaOTcbrdiY2MVFxenlStXyu12KycnR0OGDJHdbld6erq2b9+u3bt3y263my4XwAWo6+83V0YA+FV+fr7279+vJ598UkFB3l9BQUFByszM1L59+5Sfn2+oQgD+RhgB4FcHDx6UJHXv3r3W9dXt1f0ABD7CCAC/ateunSRp+/btta6vbq/uByDwEUYA+FW/fv3UqVMnTZs2TVVVVV7rqqqqlJWVpc6dO6tfv36GKgTgb4QRAH5lt9s1ffp0rVq1Sunp6dq8ebNOnjypzZs3Kz09XatWrVJ2dj
aDV4EmJNh0AQCanqFDh2rZsmV69NFHdeutt3raO3furGXLlmno0KEGqwPgbzzaC8AYt9utTz75RB999JEGDx6spKQkrogAAeSiPdq7ceNGpaWlqX379rLZbFq5cmWdt/3ss88UHBysnj17+npYAAHIbrerf//+uvXWW9W/f3+CCNBE+RxGysrK1KNHD82ePdun7Y4ePaqRI0dq4MCBvh4SAAAEMJ/HjAwePFiDBw/2+UAPPfSQhg8fLrvd7tPVFAAAENj8MoB13rx52rt3r9599139+c9/Pm//iooKVVRUeJadTqckyeVyyeVyXbQ6AfiX2+3W+vXrtXHjRjkcDg0YMIBbNUAAqetv9kUPI7t379YTTzyh/Px8BQfX7XBZWVmaOnVqjfZ169YpPDy8oUsEYEBBQYHmzZunQ4cOSZJmzJihNm3aaPTo0UpMTDRcHYCGUF5eXqd+FzWMuN1uDR8+XFOnTtV1111X5+0yMzOVkZHhWXY6nYqJiVFKSgpP0wABYMWKFXr55Zc1ZMgQTZo0SSUlJWrbtq2ys7P18ssva9GiRbr77rtNlwngAlXf2TifC3q012azacWKFUpPT691/dGjR3X55Zd7XXatqqqSZVmy2+1at26dbrvttvMeh0d7gcDBrL1A01HX3++LemUkIiJCX3/9tVfbG2+8oY8//ljLli1T586dL+bhAVyCqmftXbhwoYKCguR2uz3rqmft7du3r/Lz8zVgwABzhQLwG5/DyIkTJ7Rnzx7P8r59+1RUVKRWrVqpY8eOyszM1IEDBzR//nwFBQXVmJmzTZs2atas2Vln7AQQ2Ji1F8Cv+fyeka1btyo+Pl7x8fGSpIyMDMXHx2vy5MmSTn+BFBcXN2yVAAIGs/YC+DVeBw/ArxgzAjQdF+118ABwIZi1F8CvMWsvAL9j1l4AZ+I2DQBjmLUXCGyXxKO9AHAu1bP2lpWVMWsv0IQxZgQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUT6HkY0bNyotLU3t27eXzWbTypUrz9l/+fLlSk5OVuvWrRUREaHExEStXbu2vvUCAIAA43MYKSsrU48ePTR79uw69d+4caOSk5OVk5OjwsJCJSUlKS0tTV988YXPxQIAgMAT7OsGgwcP1uDBg+vcf+bMmV7L06ZN0wcffKB//OMfio+P9/XwAAAgwPgcRi5UVVWVjh8/rlatWp21T0VFhSoqKjzLTqdTkuRyueRyuS56jQD8p/qc5twGAk9dz2u/h5Hs7GydOHFC995771n7ZGVlaerUqTXa161bp/Dw8ItZHgBDcnNzTZcAoIGVl5fXqZ/Nsiyrvgex2WxasWKF0tPT69R/wYIFGjNmjD744AMNGjTorP1quzISExOjI0eOKCIior7lArgEuVwu5ebmKjk5WSEhIabLAdCAnE6noqKidOzYsXP+fvvtysiiRYv04IMPaunSpecMIpLkcDjkcDhqtIeEhPBlBQQozm8g8NT1nPbLe0YWLlyo0aNHa+HChbrjjjv8cUgAANBI+Hxl5MSJE9qzZ49ned++fSoqKlKrVq3UsWNHZWZm6sCBA5o/f76k07dmRo0apVmzZikhIUElJSWSpLCwMEVGRjbQxwAAAI2Vz1dGtm7dqvj4eM9juRkZGYqPj9fkyZMlSQcPHlRxcbGn/5tvvqlTp05p7Nixateuneff+PHjG+gjAACAxsznKyMDBgzQuca8vv32217L69ev9/UQAACgCWFuGgAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAA
BGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRg
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"countries.boxplot(column=\"Population 2020\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Отсечение данных для признака Population 2020, значения которых больше 50 000 000"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 40,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Country (or dependency)</th>\n",
|
|||
|
" <th>Population 2020</th>\n",
|
|||
|
" <th>Population Clip</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>no</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>China</td>\n",
|
|||
|
" <td>1439323776</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>India</td>\n",
|
|||
|
" <td>1380004385</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>United States</td>\n",
|
|||
|
" <td>331002651</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>Indonesia</td>\n",
|
|||
|
" <td>273523615</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>Pakistan</td>\n",
|
|||
|
" <td>220892340</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>Brazil</td>\n",
|
|||
|
" <td>212559417</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>Nigeria</td>\n",
|
|||
|
" <td>206139589</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>Bangladesh</td>\n",
|
|||
|
" <td>164689383</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>Russia</td>\n",
|
|||
|
" <td>145934462</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10</th>\n",
|
|||
|
" <td>Mexico</td>\n",
|
|||
|
" <td>128932753</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>11</th>\n",
|
|||
|
" <td>Japan</td>\n",
|
|||
|
" <td>126476461</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12</th>\n",
|
|||
|
" <td>Ethiopia</td>\n",
|
|||
|
" <td>114963588</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13</th>\n",
|
|||
|
" <td>Philippines</td>\n",
|
|||
|
" <td>109581078</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>Egypt</td>\n",
|
|||
|
" <td>102334404</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15</th>\n",
|
|||
|
" <td>Vietnam</td>\n",
|
|||
|
" <td>97338579</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16</th>\n",
|
|||
|
" <td>DR Congo</td>\n",
|
|||
|
" <td>89561403</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17</th>\n",
|
|||
|
" <td>Turkey</td>\n",
|
|||
|
" <td>84339067</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18</th>\n",
|
|||
|
" <td>Iran</td>\n",
|
|||
|
" <td>83992949</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19</th>\n",
|
|||
|
" <td>Germany</td>\n",
|
|||
|
" <td>83783942</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>20</th>\n",
|
|||
|
" <td>Thailand</td>\n",
|
|||
|
" <td>69799978</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21</th>\n",
|
|||
|
" <td>United Kingdom</td>\n",
|
|||
|
" <td>67886011</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>22</th>\n",
|
|||
|
" <td>France</td>\n",
|
|||
|
" <td>65273511</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>23</th>\n",
|
|||
|
" <td>Italy</td>\n",
|
|||
|
" <td>60461826</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>24</th>\n",
|
|||
|
" <td>Tanzania</td>\n",
|
|||
|
" <td>59734218</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>25</th>\n",
|
|||
|
" <td>South Africa</td>\n",
|
|||
|
" <td>59308690</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>26</th>\n",
|
|||
|
" <td>Myanmar</td>\n",
|
|||
|
" <td>54409800</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>27</th>\n",
|
|||
|
" <td>Kenya</td>\n",
|
|||
|
" <td>53771296</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>28</th>\n",
|
|||
|
" <td>South Korea</td>\n",
|
|||
|
" <td>51269185</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>29</th>\n",
|
|||
|
" <td>Colombia</td>\n",
|
|||
|
" <td>50882891</td>\n",
|
|||
|
" <td>50000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Country (or dependency) Population 2020 Population Clip\n",
|
|||
|
"no \n",
|
|||
|
"1 China 1439323776 50000000\n",
|
|||
|
"2 India 1380004385 50000000\n",
|
|||
|
"3 United States 331002651 50000000\n",
|
|||
|
"4 Indonesia 273523615 50000000\n",
|
|||
|
"5 Pakistan 220892340 50000000\n",
|
|||
|
"6 Brazil 212559417 50000000\n",
|
|||
|
"7 Nigeria 206139589 50000000\n",
|
|||
|
"8 Bangladesh 164689383 50000000\n",
|
|||
|
"9 Russia 145934462 50000000\n",
|
|||
|
"10 Mexico 128932753 50000000\n",
|
|||
|
"11 Japan 126476461 50000000\n",
|
|||
|
"12 Ethiopia 114963588 50000000\n",
|
|||
|
"13 Philippines 109581078 50000000\n",
|
|||
|
"14 Egypt 102334404 50000000\n",
|
|||
|
"15 Vietnam 97338579 50000000\n",
|
|||
|
"16 DR Congo 89561403 50000000\n",
|
|||
|
"17 Turkey 84339067 50000000\n",
|
|||
|
"18 Iran 83992949 50000000\n",
|
|||
|
"19 Germany 83783942 50000000\n",
|
|||
|
"20 Thailand 69799978 50000000\n",
|
|||
|
"21 United Kingdom 67886011 50000000\n",
|
|||
|
"22 France 65273511 50000000\n",
|
|||
|
"23 Italy 60461826 50000000\n",
|
|||
|
"24 Tanzania 59734218 50000000\n",
|
|||
|
"25 South Africa 59308690 50000000\n",
|
|||
|
"26 Myanmar 54409800 50000000\n",
|
|||
|
"27 Kenya 53771296 50000000\n",
|
|||
|
"28 South Korea 51269185 50000000\n",
|
|||
|
"29 Colombia 50882891 50000000"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 40,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"countries_norm = countries.copy()\n",
|
|||
|
"\n",
|
|||
|
"countries_norm[\"Population Clip\"] = countries_norm[\"Population 2020\"].clip(0, 50000000);\n",
|
|||
|
"\n",
|
|||
|
"countries_norm[countries_norm[\"Population 2020\"] > 50000000][\n",
|
|||
|
" [\"Country (or dependency)\", \"Population 2020\", \"Population Clip\"]\n",
|
|||
|
"]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Винсоризация признака Население (Population 2020): верхние 5 % значений заменяются ближайшим допустимым значением"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 41,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"111195830.99999991\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Country (or dependency)</th>\n",
|
|||
|
" <th>Population 2020</th>\n",
|
|||
|
" <th>PopulationWinsorized</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>no</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>China</td>\n",
|
|||
|
" <td>1439323776</td>\n",
|
|||
|
" <td>114963588</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>India</td>\n",
|
|||
|
" <td>1380004385</td>\n",
|
|||
|
" <td>114963588</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>United States</td>\n",
|
|||
|
" <td>331002651</td>\n",
|
|||
|
" <td>114963588</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>Indonesia</td>\n",
|
|||
|
" <td>273523615</td>\n",
|
|||
|
" <td>114963588</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>Pakistan</td>\n",
|
|||
|
" <td>220892340</td>\n",
|
|||
|
" <td>114963588</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>Brazil</td>\n",
|
|||
|
" <td>212559417</td>\n",
|
|||
|
" <td>114963588</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>Nigeria</td>\n",
|
|||
|
" <td>206139589</td>\n",
|
|||
|
" <td>114963588</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>Bangladesh</td>\n",
|
|||
|
" <td>164689383</td>\n",
|
|||
|
" <td>114963588</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>Russia</td>\n",
|
|||
|
" <td>145934462</td>\n",
|
|||
|
" <td>114963588</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10</th>\n",
|
|||
|
" <td>Mexico</td>\n",
|
|||
|
" <td>128932753</td>\n",
|
|||
|
" <td>114963588</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>11</th>\n",
|
|||
|
" <td>Japan</td>\n",
|
|||
|
" <td>126476461</td>\n",
|
|||
|
" <td>114963588</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12</th>\n",
|
|||
|
" <td>Ethiopia</td>\n",
|
|||
|
" <td>114963588</td>\n",
|
|||
|
" <td>114963588</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13</th>\n",
|
|||
|
" <td>Philippines</td>\n",
|
|||
|
" <td>109581078</td>\n",
|
|||
|
" <td>109581078</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>Egypt</td>\n",
|
|||
|
" <td>102334404</td>\n",
|
|||
|
" <td>102334404</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15</th>\n",
|
|||
|
" <td>Vietnam</td>\n",
|
|||
|
" <td>97338579</td>\n",
|
|||
|
" <td>97338579</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16</th>\n",
|
|||
|
" <td>DR Congo</td>\n",
|
|||
|
" <td>89561403</td>\n",
|
|||
|
" <td>89561403</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17</th>\n",
|
|||
|
" <td>Turkey</td>\n",
|
|||
|
" <td>84339067</td>\n",
|
|||
|
" <td>84339067</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18</th>\n",
|
|||
|
" <td>Iran</td>\n",
|
|||
|
" <td>83992949</td>\n",
|
|||
|
" <td>83992949</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19</th>\n",
|
|||
|
" <td>Germany</td>\n",
|
|||
|
" <td>83783942</td>\n",
|
|||
|
" <td>83783942</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>20</th>\n",
|
|||
|
" <td>Thailand</td>\n",
|
|||
|
" <td>69799978</td>\n",
|
|||
|
" <td>69799978</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21</th>\n",
|
|||
|
" <td>United Kingdom</td>\n",
|
|||
|
" <td>67886011</td>\n",
|
|||
|
" <td>67886011</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>22</th>\n",
|
|||
|
" <td>France</td>\n",
|
|||
|
" <td>65273511</td>\n",
|
|||
|
" <td>65273511</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>23</th>\n",
|
|||
|
" <td>Italy</td>\n",
|
|||
|
" <td>60461826</td>\n",
|
|||
|
" <td>60461826</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>24</th>\n",
|
|||
|
" <td>Tanzania</td>\n",
|
|||
|
" <td>59734218</td>\n",
|
|||
|
" <td>59734218</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>25</th>\n",
|
|||
|
" <td>South Africa</td>\n",
|
|||
|
" <td>59308690</td>\n",
|
|||
|
" <td>59308690</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>26</th>\n",
|
|||
|
" <td>Myanmar</td>\n",
|
|||
|
" <td>54409800</td>\n",
|
|||
|
" <td>54409800</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>27</th>\n",
|
|||
|
" <td>Kenya</td>\n",
|
|||
|
" <td>53771296</td>\n",
|
|||
|
" <td>53771296</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>28</th>\n",
|
|||
|
" <td>South Korea</td>\n",
|
|||
|
" <td>51269185</td>\n",
|
|||
|
" <td>51269185</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>29</th>\n",
|
|||
|
" <td>Colombia</td>\n",
|
|||
|
" <td>50882891</td>\n",
|
|||
|
" <td>50882891</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Country (or dependency) Population 2020 PopulationWinsorized\n",
|
|||
|
"no \n",
|
|||
|
"1 China 1439323776 114963588\n",
|
|||
|
"2 India 1380004385 114963588\n",
|
|||
|
"3 United States 331002651 114963588\n",
|
|||
|
"4 Indonesia 273523615 114963588\n",
|
|||
|
"5 Pakistan 220892340 114963588\n",
|
|||
|
"6 Brazil 212559417 114963588\n",
|
|||
|
"7 Nigeria 206139589 114963588\n",
|
|||
|
"8 Bangladesh 164689383 114963588\n",
|
|||
|
"9 Russia 145934462 114963588\n",
|
|||
|
"10 Mexico 128932753 114963588\n",
|
|||
|
"11 Japan 126476461 114963588\n",
|
|||
|
"12 Ethiopia 114963588 114963588\n",
|
|||
|
"13 Philippines 109581078 109581078\n",
|
|||
|
"14 Egypt 102334404 102334404\n",
|
|||
|
"15 Vietnam 97338579 97338579\n",
|
|||
|
"16 DR Congo 89561403 89561403\n",
|
|||
|
"17 Turkey 84339067 84339067\n",
|
|||
|
"18 Iran 83992949 83992949\n",
|
|||
|
"19 Germany 83783942 83783942\n",
|
|||
|
"20 Thailand 69799978 69799978\n",
|
|||
|
"21 United Kingdom 67886011 67886011\n",
|
|||
|
"22 France 65273511 65273511\n",
|
|||
|
"23 Italy 60461826 60461826\n",
|
|||
|
"24 Tanzania 59734218 59734218\n",
|
|||
|
"25 South Africa 59308690 59308690\n",
|
|||
|
"26 Myanmar 54409800 54409800\n",
|
|||
|
"27 Kenya 53771296 53771296\n",
|
|||
|
"28 South Korea 51269185 51269185\n",
|
|||
|
"29 Colombia 50882891 50882891"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 41,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from scipy.stats.mstats import winsorize\n",
|
|||
|
"\n",
|
|||
|
"print(countries_norm[\"Population 2020\"].quantile(q=0.95))\n",
|
|||
|
"\n",
|
|||
|
"countries_norm[\"PopulationWinsorized\"] = winsorize(\n",
|
|||
|
" countries_norm[\"Population 2020\"].fillna(countries_norm[\"Population 2020\"].mean()),\n",
|
|||
|
" (0, 0.05),\n",
|
|||
|
" inplace=False,\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"countries_norm[countries_norm[\"Population 2020\"] > 50000000][\n",
|
|||
|
" [\"Country (or dependency)\", \"Population 2020\", \"PopulationWinsorized\"]\n",
|
|||
|
"]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Нормализация значений"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 43,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Country (or dependency)</th>\n",
|
|||
|
" <th>Population 2020</th>\n",
|
|||
|
" <th>PopulationNorm</th>\n",
|
|||
|
" <th>PopulationClipNorm</th>\n",
|
|||
|
" <th>PopulationWinsorizedNorm</th>\n",
|
|||
|
" <th>PopulationWinsorizedNorm2</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>no</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>China</td>\n",
|
|||
|
" <td>1439323776</td>\n",
|
|||
|
" <td>1.000000e+00</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>India</td>\n",
|
|||
|
" <td>1380004385</td>\n",
|
|||
|
" <td>9.587866e-01</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>United States</td>\n",
|
|||
|
" <td>331002651</td>\n",
|
|||
|
" <td>2.299705e-01</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>Indonesia</td>\n",
|
|||
|
" <td>273523615</td>\n",
|
|||
|
" <td>1.900357e-01</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>Pakistan</td>\n",
|
|||
|
" <td>220892340</td>\n",
|
|||
|
" <td>1.534691e-01</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>231</th>\n",
|
|||
|
" <td>Montserrat</td>\n",
|
|||
|
" <td>4992</td>\n",
|
|||
|
" <td>2.911786e-06</td>\n",
|
|||
|
" <td>0.000084</td>\n",
|
|||
|
" <td>0.000036</td>\n",
|
|||
|
" <td>-0.999927</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>232</th>\n",
|
|||
|
" <td>Falkland Islands</td>\n",
|
|||
|
" <td>3480</td>\n",
|
|||
|
" <td>1.861292e-06</td>\n",
|
|||
|
" <td>0.000054</td>\n",
|
|||
|
" <td>0.000023</td>\n",
|
|||
|
" <td>-0.999953</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>233</th>\n",
|
|||
|
" <td>Niue</td>\n",
|
|||
|
" <td>1626</td>\n",
|
|||
|
" <td>5.731862e-07</td>\n",
|
|||
|
" <td>0.000017</td>\n",
|
|||
|
" <td>0.000007</td>\n",
|
|||
|
" <td>-0.999986</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>234</th>\n",
|
|||
|
" <td>Tokelau</td>\n",
|
|||
|
" <td>1357</td>\n",
|
|||
|
" <td>3.862927e-07</td>\n",
|
|||
|
" <td>0.000011</td>\n",
|
|||
|
" <td>0.000005</td>\n",
|
|||
|
" <td>-0.999990</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>235</th>\n",
|
|||
|
" <td>Holy See</td>\n",
|
|||
|
" <td>801</td>\n",
|
|||
|
" <td>0.000000e+00</td>\n",
|
|||
|
" <td>0.000000</td>\n",
|
|||
|
" <td>0.000000</td>\n",
|
|||
|
" <td>-1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>235 rows × 6 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Country (or dependency) Population 2020 PopulationNorm \\\n",
|
|||
|
"no \n",
|
|||
|
"1 China 1439323776 1.000000e+00 \n",
|
|||
|
"2 India 1380004385 9.587866e-01 \n",
|
|||
|
"3 United States 331002651 2.299705e-01 \n",
|
|||
|
"4 Indonesia 273523615 1.900357e-01 \n",
|
|||
|
"5 Pakistan 220892340 1.534691e-01 \n",
|
|||
|
".. ... ... ... \n",
|
|||
|
"231 Montserrat 4992 2.911786e-06 \n",
|
|||
|
"232 Falkland Islands 3480 1.861292e-06 \n",
|
|||
|
"233 Niue 1626 5.731862e-07 \n",
|
|||
|
"234 Tokelau 1357 3.862927e-07 \n",
|
|||
|
"235 Holy See 801 0.000000e+00 \n",
|
|||
|
"\n",
|
|||
|
" PopulationClipNorm PopulationWinsorizedNorm PopulationWinsorizedNorm2 \n",
|
|||
|
"no \n",
|
|||
|
"1 1.000000 1.000000 1.000000 \n",
|
|||
|
"2 1.000000 1.000000 1.000000 \n",
|
|||
|
"3 1.000000 1.000000 1.000000 \n",
|
|||
|
"4 1.000000 1.000000 1.000000 \n",
|
|||
|
"5 1.000000 1.000000 1.000000 \n",
|
|||
|
".. ... ... ... \n",
|
|||
|
"231 0.000084 0.000036 -0.999927 \n",
|
|||
|
"232 0.000054 0.000023 -0.999953 \n",
|
|||
|
"233 0.000017 0.000007 -0.999986 \n",
|
|||
|
"234 0.000011 0.000005 -0.999990 \n",
|
|||
|
"235 0.000000 0.000000 -1.000000 \n",
|
|||
|
"\n",
|
|||
|
"[235 rows x 6 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 43,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn import preprocessing\n",
|
|||
|
"\n",
|
|||
|
"min_max_scaler = preprocessing.MinMaxScaler()\n",
|
|||
|
"\n",
|
|||
|
"min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n",
|
|||
|
"\n",
|
|||
|
"countries_norm[\"PopulationNorm\"] = min_max_scaler.fit_transform(\n",
|
|||
|
" countries_norm[\"Population 2020\"].to_numpy().reshape(-1, 1)\n",
|
|||
|
").reshape(countries_norm[\"Population 2020\"].shape)\n",
|
|||
|
"\n",
|
|||
|
"countries_norm[\"PopulationClipNorm\"] = min_max_scaler.fit_transform(\n",
|
|||
|
" countries_norm[\"Population Clip\"].to_numpy().reshape(-1, 1)\n",
|
|||
|
").reshape(countries_norm[\"Population 2020\"].shape)\n",
|
|||
|
"\n",
|
|||
|
"countries_norm[\"PopulationWinsorizedNorm\"] = min_max_scaler.fit_transform(\n",
|
|||
|
" countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n",
|
|||
|
").reshape(countries_norm[\"Population 2020\"].shape)\n",
|
|||
|
"\n",
|
|||
|
"countries_norm[\"PopulationWinsorizedNorm2\"] = min_max_scaler_2.fit_transform(\n",
|
|||
|
" countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n",
|
|||
|
").reshape(countries_norm[\"Population 2020\"].shape)\n",
|
|||
|
"\n",
|
|||
|
"countries_norm[\n",
|
|||
|
" [\n",
|
|||
|
" \"Country (or dependency)\",\n",
|
|||
|
" \"Population 2020\",\n",
|
|||
|
" \"PopulationNorm\",\n",
|
|||
|
" \"PopulationClipNorm\",\n",
|
|||
|
" \"PopulationWinsorizedNorm\",\n",
|
|||
|
" \"PopulationWinsorizedNorm2\",\n",
|
|||
|
" ]\n",
|
|||
|
"]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Стандартизация значений"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 44,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Country (or dependency)</th>\n",
|
|||
|
" <th>Population 2020</th>\n",
|
|||
|
" <th>PopulationStand</th>\n",
|
|||
|
" <th>PopulationClipStand</th>\n",
|
|||
|
" <th>PopulationWinsorizedStand</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>no</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>China</td>\n",
|
|||
|
" <td>1439323776</td>\n",
|
|||
|
" <td>10.427597</td>\n",
|
|||
|
" <td>2.073933</td>\n",
|
|||
|
" <td>3.171659</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>India</td>\n",
|
|||
|
" <td>1380004385</td>\n",
|
|||
|
" <td>9.987702</td>\n",
|
|||
|
" <td>2.073933</td>\n",
|
|||
|
" <td>3.171659</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>United States</td>\n",
|
|||
|
" <td>331002651</td>\n",
|
|||
|
" <td>2.208627</td>\n",
|
|||
|
" <td>2.073933</td>\n",
|
|||
|
" <td>3.171659</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>Indonesia</td>\n",
|
|||
|
" <td>273523615</td>\n",
|
|||
|
" <td>1.782380</td>\n",
|
|||
|
" <td>2.073933</td>\n",
|
|||
|
" <td>3.171659</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>Pakistan</td>\n",
|
|||
|
" <td>220892340</td>\n",
|
|||
|
" <td>1.392082</td>\n",
|
|||
|
" <td>2.073933</td>\n",
|
|||
|
" <td>3.171659</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>231</th>\n",
|
|||
|
" <td>Montserrat</td>\n",
|
|||
|
" <td>4992</td>\n",
|
|||
|
" <td>-0.245950</td>\n",
|
|||
|
" <td>-0.795071</td>\n",
|
|||
|
" <td>-0.621969</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>232</th>\n",
|
|||
|
" <td>Falkland Islands</td>\n",
|
|||
|
" <td>3480</td>\n",
|
|||
|
" <td>-0.245962</td>\n",
|
|||
|
" <td>-0.795158</td>\n",
|
|||
|
" <td>-0.622019</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>233</th>\n",
|
|||
|
" <td>Niue</td>\n",
|
|||
|
" <td>1626</td>\n",
|
|||
|
" <td>-0.245975</td>\n",
|
|||
|
" <td>-0.795265</td>\n",
|
|||
|
" <td>-0.622080</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>234</th>\n",
|
|||
|
" <td>Tokelau</td>\n",
|
|||
|
" <td>1357</td>\n",
|
|||
|
" <td>-0.245977</td>\n",
|
|||
|
" <td>-0.795280</td>\n",
|
|||
|
" <td>-0.622089</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>235</th>\n",
|
|||
|
" <td>Holy See</td>\n",
|
|||
|
" <td>801</td>\n",
|
|||
|
" <td>-0.245982</td>\n",
|
|||
|
" <td>-0.795312</td>\n",
|
|||
|
" <td>-0.622107</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>235 rows × 5 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Country (or dependency) Population 2020 PopulationStand \\\n",
|
|||
|
"no \n",
|
|||
|
"1 China 1439323776 10.427597 \n",
|
|||
|
"2 India 1380004385 9.987702 \n",
|
|||
|
"3 United States 331002651 2.208627 \n",
|
|||
|
"4 Indonesia 273523615 1.782380 \n",
|
|||
|
"5 Pakistan 220892340 1.392082 \n",
|
|||
|
".. ... ... ... \n",
|
|||
|
"231 Montserrat 4992 -0.245950 \n",
|
|||
|
"232 Falkland Islands 3480 -0.245962 \n",
|
|||
|
"233 Niue 1626 -0.245975 \n",
|
|||
|
"234 Tokelau 1357 -0.245977 \n",
|
|||
|
"235 Holy See 801 -0.245982 \n",
|
|||
|
"\n",
|
|||
|
" PopulationClipStand PopulationWinsorizedStand \n",
|
|||
|
"no \n",
|
|||
|
"1 2.073933 3.171659 \n",
|
|||
|
"2 2.073933 3.171659 \n",
|
|||
|
"3 2.073933 3.171659 \n",
|
|||
|
"4 2.073933 3.171659 \n",
|
|||
|
"5 2.073933 3.171659 \n",
|
|||
|
".. ... ... \n",
|
|||
|
"231 -0.795071 -0.621969 \n",
|
|||
|
"232 -0.795158 -0.622019 \n",
|
|||
|
"233 -0.795265 -0.622080 \n",
|
|||
|
"234 -0.795280 -0.622089 \n",
|
|||
|
"235 -0.795312 -0.622107 \n",
|
|||
|
"\n",
|
|||
|
"[235 rows x 5 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 44,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn import preprocessing\n",
|
|||
|
"\n",
|
|||
|
"stndart_scaler = preprocessing.StandardScaler()\n",
|
|||
|
"\n",
|
|||
|
"countries_norm[\"PopulationStand\"] = stndart_scaler.fit_transform(\n",
|
|||
|
" countries_norm[\"Population 2020\"].to_numpy().reshape(-1, 1)\n",
|
|||
|
").reshape(countries_norm[\"Population 2020\"].shape)\n",
|
|||
|
"\n",
|
|||
|
"countries_norm[\"PopulationClipStand\"] = stndart_scaler.fit_transform(\n",
|
|||
|
" countries_norm[\"Population Clip\"].to_numpy().reshape(-1, 1)\n",
|
|||
|
").reshape(countries_norm[\"Population 2020\"].shape)\n",
|
|||
|
"\n",
|
|||
|
"countries_norm[\"PopulationWinsorizedStand\"] = stndart_scaler.fit_transform(\n",
|
|||
|
" countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n",
|
|||
|
").reshape(countries_norm[\"Population 2020\"].shape)\n",
|
|||
|
"\n",
|
|||
|
"countries_norm[\n",
|
|||
|
" [\n",
|
|||
|
" \"Country (or dependency)\",\n",
|
|||
|
" \"Population 2020\",\n",
|
|||
|
" \"PopulationStand\",\n",
|
|||
|
" \"PopulationClipStand\",\n",
|
|||
|
" \"PopulationWinsorizedStand\",\n",
|
|||
|
" ]\n",
|
|||
|
"]"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": ".venv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|