diff --git a/data/countries-continents-capitals.csv b/data/countries-continents-capitals.csv
index b90f340..61258ec 100644
--- a/data/countries-continents-capitals.csv
+++ b/data/countries-continents-capitals.csv
@@ -1,4 +1,4 @@
-Country/Territory,Capital,Continent
+Country,Capital,Continent
Afghanistan,Kabul,Asia
Albania,Tirana,Europe
Algeria,Algiers,Africa
@@ -6,7 +6,7 @@ American Samoa,Pago Pago,Oceania
Andorra,Andorra la Vella,Europe
Angola,Luanda,Africa
Anguilla,The Valley,North America
-Antigua and Barbuda,Saint John??s,North America
+Antigua and Barbuda,Saint John's,North America
Argentina,Buenos Aires,South America
Armenia,Yerevan,Asia
Aruba,Oranjestad,North America
@@ -80,7 +80,7 @@ Greece,Athens,Europe
Greenland,Nuuk,North America
Grenada,Saint George's,North America
Guadeloupe,Basse-Terre,North America
-Guam,Hag?t?a,Oceania
+Guam,Hagåtña,Oceania
Guatemala,Guatemala City,North America
Guernsey,Saint Peter Port,Europe
Guinea,Conakry,Africa
@@ -180,7 +180,7 @@ Saint Pierre and Miquelon,Saint-Pierre,North America
Saint Vincent and the Grenadines,Kingstown,North America
Samoa,Apia,Oceania
San Marino,San Marino,Europe
-Sao Tome and Principe,S?o Tom??,Africa
+Sao Tome and Principe,São Tomé,Africa
Saudi Arabia,Riyadh,Asia
Senegal,Dakar,Africa
Serbia,Belgrade,Europe
@@ -209,7 +209,7 @@ Thailand,Bangkok,Asia
Timor-Leste,Dili,Asia
Togo,Lom??,Africa
Tokelau,Nukunonu,Oceania
-Tonga,Nuku??alofa,Oceania
+Tonga,Nuku'alofa,Oceania
Trinidad and Tobago,Port-of-Spain,North America
Tunisia,Tunis,Africa
Turkey,Ankara,Asia
@@ -229,7 +229,7 @@ Vatican City,Vatican City,Europe
Venezuela,Caracas,South America
Vietnam,Hanoi,Asia
Wallis and Futuna,Mata-Utu,Oceania
-Western Sahara,El Aai?n,Africa
+Western Sahara,El Aaiún,Africa
Yemen,Sanaa,Asia
Zambia,Lusaka,Africa
Zimbabwe,Harare,Africa
diff --git a/data/world-population-by-country-2020.csv b/data/world-population-by-country-2020.csv
index 78c1f46..d923824 100644
--- a/data/world-population-by-country-2020.csv
+++ b/data/world-population-by-country-2020.csv
@@ -1,4 +1,4 @@
-no,Country (or dependency),Population 2020,Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
+no,Country,Population2020,Yearly,NetChange,Density,LandArea,Migrants,FertRate,MedAge,UrbanPop,WorldShare
1,China,"1,439,323,776",0.39%,"5,540,090",153,"9,388,211","-348,399",1.7,38,61%,18.47%
2,India,"1,380,004,385",0.99%,"13,586,631",464,"2,973,190","-532,687",2.2,28,35%,17.70%
3,United States,"331,002,651",0.59%,"1,937,734",36,"9,147,420","954,806",1.8,38,83%,4.25%
diff --git a/data/world-population-forcast-2020-2050.csv b/data/world-population-forcast-2020-2050.csv
index 621e50e..a27901d 100644
--- a/data/world-population-forcast-2020-2050.csv
+++ b/data/world-population-forcast-2020-2050.csv
@@ -1,4 +1,4 @@
-Year,Population,Yearly %,Yearly,Median,Fertility,Density
+Year,Population,YearlyPer,Yearly,Median,Fertility,Density
2020,"7,794,798,739",1.10%,"83,000,320",31,2.47,52
2025,"8,184,437,460",0.98%,"77,927,744",32,2.54,55
2030,"8,548,487,400",0.87%,"72,809,988",33,2.62,57
diff --git a/lec3.ipynb b/lec3.ipynb
new file mode 100644
index 0000000..e1fc869
--- /dev/null
+++ b/lec3.ipynb
@@ -0,0 +1,3453 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Унитарное кодирование\n",
+ "\n",
+ "Преобразование категориального признака в несколько бинарных признаков"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Загрузка набора данных о населении стран мира за 2020 год"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country | \n",
+ " Population2020 | \n",
+ " Yearly | \n",
+ " NetChange | \n",
+ " Density | \n",
+ " LandArea | \n",
+ " Migrants | \n",
+ " FertRate | \n",
+ " MedAge | \n",
+ " UrbanPop | \n",
+ " WorldShare | \n",
+ " Net Change | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " China | \n",
+ " 1439323776 | \n",
+ " 0.39 | \n",
+ " 5,540,090 | \n",
+ " 153 | \n",
+ " 9388211 | \n",
+ " -348,399 | \n",
+ " 1.7 | \n",
+ " 38 | \n",
+ " 61% | \n",
+ " 18.47% | \n",
+ " 5540090 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " India | \n",
+ " 1380004385 | \n",
+ " 0.99 | \n",
+ " 13,586,631 | \n",
+ " 464 | \n",
+ " 2973190 | \n",
+ " -532,687 | \n",
+ " 2.2 | \n",
+ " 28 | \n",
+ " 35% | \n",
+ " 17.70% | \n",
+ " 13586631 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " United States | \n",
+ " 331002651 | \n",
+ " 0.59 | \n",
+ " 1,937,734 | \n",
+ " 36 | \n",
+ " 9147420 | \n",
+ " 954,806 | \n",
+ " 1.8 | \n",
+ " 38 | \n",
+ " 83% | \n",
+ " 4.25% | \n",
+ " 1937734 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Indonesia | \n",
+ " 273523615 | \n",
+ " 1.07 | \n",
+ " 2,898,047 | \n",
+ " 151 | \n",
+ " 1811570 | \n",
+ " -98,955 | \n",
+ " 2.3 | \n",
+ " 30 | \n",
+ " 56% | \n",
+ " 3.51% | \n",
+ " 2898047 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Pakistan | \n",
+ " 220892340 | \n",
+ " 2.00 | \n",
+ " 4,327,022 | \n",
+ " 287 | \n",
+ " 770880 | \n",
+ " -233,379 | \n",
+ " 3.6 | \n",
+ " 23 | \n",
+ " 35% | \n",
+ " 2.83% | \n",
+ " 4327022 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 231 | \n",
+ " Montserrat | \n",
+ " 4992 | \n",
+ " 0.06 | \n",
+ " 3 | \n",
+ " 50 | \n",
+ " 100 | \n",
+ " NaN | \n",
+ " N.A. | \n",
+ " N.A. | \n",
+ " 10% | \n",
+ " 0.00% | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 232 | \n",
+ " Falkland Islands | \n",
+ " 3480 | \n",
+ " 3.05 | \n",
+ " 103 | \n",
+ " 0 | \n",
+ " 12170 | \n",
+ " NaN | \n",
+ " N.A. | \n",
+ " N.A. | \n",
+ " 66% | \n",
+ " 0.00% | \n",
+ " 103 | \n",
+ "
\n",
+ " \n",
+ " 233 | \n",
+ " Niue | \n",
+ " 1626 | \n",
+ " 0.68 | \n",
+ " 11 | \n",
+ " 6 | \n",
+ " 260 | \n",
+ " NaN | \n",
+ " N.A. | \n",
+ " N.A. | \n",
+ " 46% | \n",
+ " 0.00% | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " 234 | \n",
+ " Tokelau | \n",
+ " 1357 | \n",
+ " 1.27 | \n",
+ " 17 | \n",
+ " 136 | \n",
+ " 10 | \n",
+ " NaN | \n",
+ " N.A. | \n",
+ " N.A. | \n",
+ " 0% | \n",
+ " 0.00% | \n",
+ " 17 | \n",
+ "
\n",
+ " \n",
+ " 235 | \n",
+ " Holy See | \n",
+ " 801 | \n",
+ " 0.25 | \n",
+ " 2 | \n",
+ " 2,003 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " N.A. | \n",
+ " N.A. | \n",
+ " N.A. | \n",
+ " 0.00% | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
235 rows × 12 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Country Population2020 Yearly NetChange Density LandArea \\\n",
+ "no \n",
+ "1 China 1439323776 0.39 5,540,090 153 9388211 \n",
+ "2 India 1380004385 0.99 13,586,631 464 2973190 \n",
+ "3 United States 331002651 0.59 1,937,734 36 9147420 \n",
+ "4 Indonesia 273523615 1.07 2,898,047 151 1811570 \n",
+ "5 Pakistan 220892340 2.00 4,327,022 287 770880 \n",
+ ".. ... ... ... ... ... ... \n",
+ "231 Montserrat 4992 0.06 3 50 100 \n",
+ "232 Falkland Islands 3480 3.05 103 0 12170 \n",
+ "233 Niue 1626 0.68 11 6 260 \n",
+ "234 Tokelau 1357 1.27 17 136 10 \n",
+ "235 Holy See 801 0.25 2 2,003 0 \n",
+ "\n",
+ " Migrants FertRate MedAge UrbanPop WorldShare Net Change \n",
+ "no \n",
+ "1 -348,399 1.7 38 61% 18.47% 5540090 \n",
+ "2 -532,687 2.2 28 35% 17.70% 13586631 \n",
+ "3 954,806 1.8 38 83% 4.25% 1937734 \n",
+ "4 -98,955 2.3 30 56% 3.51% 2898047 \n",
+ "5 -233,379 3.6 23 35% 2.83% 4327022 \n",
+ ".. ... ... ... ... ... ... \n",
+ "231 NaN N.A. N.A. 10% 0.00% 3 \n",
+ "232 NaN N.A. N.A. 66% 0.00% 103 \n",
+ "233 NaN N.A. N.A. 46% 0.00% 11 \n",
+ "234 NaN N.A. N.A. 0% 0.00% 17 \n",
+ "235 NaN N.A. N.A. N.A. 0.00% 2 \n",
+ "\n",
+ "[235 rows x 12 columns]"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "countries = pd.read_csv(\n",
+ " \"data/world-population-by-country-2020.csv\", index_col=\"no\"\n",
+ ")\n",
+ "\n",
+ "countries[\"Population2020\"] = countries[\"Population2020\"].apply(\n",
+ " lambda x: int(\"\".join(x.split(\",\")))\n",
+ ")\n",
+ "countries[\"Net Change\"] = countries[\"NetChange\"].apply(\n",
+ " lambda x: int(\"\".join(x.split(\",\")))\n",
+ ")\n",
+ "countries[\"Yearly\"] = countries[\"Yearly\"].apply(\n",
+ " lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
+ ")\n",
+ "countries[\"LandArea\"] = countries[\"LandArea\"].apply(\n",
+ " lambda x: int(\"\".join(x.split(\",\")))\n",
+ ")\n",
+ "countries"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Унитарное кодирование признаков Пол (Sex) и Порт посадки (Embarked)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Кодирование"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.preprocessing import OneHotEncoder\n",
+ "import numpy as np\n",
+ "\n",
+ "# encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
+ "\n",
+ "# encoded_values = encoder.fit_transform(titanic[[\"Embarked\", \"Sex\"]])\n",
+ "\n",
+ "# encoded_columns = encoder.get_feature_names_out([\"Embarked\", \"Sex\"])\n",
+ "\n",
+ "# encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n",
+ "\n",
+ "# encoded_values_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Добавление признаков в исходный Dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# titanic = pd.concat([titanic, encoded_values_df], axis=1)\n",
+ "\n",
+ "# titanic"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Дискретизация признаков"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Равномерное разделение данных на 3 группы"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "labels = [\"Small\", \"Middle\", \"Big\"]\n",
+ "num_bins = 3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(array([ 0. , 5458956.66666667, 10917913.33333333,\n",
+ " 16376870. ]),\n",
+ " array([229, 5, 1]))"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "hist1, bins1 = np.histogram(\n",
+ " countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins=num_bins\n",
+ ")\n",
+ "bins1, hist1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " LandArea | \n",
+ " LandArea | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 9388211 | \n",
+ " (5458956.667, 10917913.333] | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2973190 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 9147420 | \n",
+ " (5458956.667, 10917913.333] | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1811570 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 770880 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 8358140 | \n",
+ " (5458956.667, 10917913.333] | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 910770 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 130170 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 16376870 | \n",
+ " (10917913.333, 16376870.0] | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 1943950 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 364555 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 1000000 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 298170 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 995450 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 310070 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 2267050 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 769630 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 1628550 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 348560 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 510890 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " LandArea LandArea\n",
+ "no \n",
+ "1 9388211 (5458956.667, 10917913.333]\n",
+ "2 2973190 (0.0, 5458956.667]\n",
+ "3 9147420 (5458956.667, 10917913.333]\n",
+ "4 1811570 (0.0, 5458956.667]\n",
+ "5 770880 (0.0, 5458956.667]\n",
+ "6 8358140 (5458956.667, 10917913.333]\n",
+ "7 910770 (0.0, 5458956.667]\n",
+ "8 130170 (0.0, 5458956.667]\n",
+ "9 16376870 (10917913.333, 16376870.0]\n",
+ "10 1943950 (0.0, 5458956.667]\n",
+ "11 364555 (0.0, 5458956.667]\n",
+ "12 1000000 (0.0, 5458956.667]\n",
+ "13 298170 (0.0, 5458956.667]\n",
+ "14 995450 (0.0, 5458956.667]\n",
+ "15 310070 (0.0, 5458956.667]\n",
+ "16 2267050 (0.0, 5458956.667]\n",
+ "17 769630 (0.0, 5458956.667]\n",
+ "18 1628550 (0.0, 5458956.667]\n",
+ "19 348560 (0.0, 5458956.667]\n",
+ "20 510890 (0.0, 5458956.667]"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat(\n",
+ " [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins1))], axis=1\n",
+ ").head(20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " LandArea | \n",
+ " LandArea | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 9388211 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2973190 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 9147420 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1811570 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 770880 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 8358140 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 910770 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 130170 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 16376870 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 1943950 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 364555 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 1000000 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 298170 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 995450 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 310070 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 2267050 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 769630 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 1628550 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 348560 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 510890 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " LandArea LandArea\n",
+ "no \n",
+ "1 9388211 Middle\n",
+ "2 2973190 Small\n",
+ "3 9147420 Middle\n",
+ "4 1811570 Small\n",
+ "5 770880 Small\n",
+ "6 8358140 Middle\n",
+ "7 910770 Small\n",
+ "8 130170 Small\n",
+ "9 16376870 Big\n",
+ "10 1943950 Small\n",
+ "11 364555 Small\n",
+ "12 1000000 Small\n",
+ "13 298170 Small\n",
+ "14 995450 Small\n",
+ "15 310070 Small\n",
+ "16 2267050 Small\n",
+ "17 769630 Small\n",
+ "18 1628550 Small\n",
+ "19 348560 Small\n",
+ "20 510890 Small"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins1), labels=labels)], axis=1).head(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Равномерное разделение данных на 3 группы с установкой собственной границы диапазона значений (от 0 до 12 000 000)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(array([ 0., 4000000., 8000000., 12000000.]),\n",
+ " array([229, 1, 4, 1]))"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "labels = [\"Small\", \"Middle\", \"Big\"]\n",
+ "bins2 = np.linspace(0, 12000000, 4)\n",
+ "\n",
+ "tmp_bins2 = np.digitize(\n",
+ " countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins2\n",
+ ")\n",
+ "\n",
+ "hist2 = np.bincount(tmp_bins2 - 1)\n",
+ "\n",
+ "bins2, hist2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " LandArea | \n",
+ " LandArea | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 9388211 | \n",
+ " (8000000.0, 12000000.0] | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2973190 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 9147420 | \n",
+ " (8000000.0, 12000000.0] | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1811570 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 770880 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 8358140 | \n",
+ " (8000000.0, 12000000.0] | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 910770 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 130170 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 16376870 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 1943950 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 364555 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 1000000 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 298170 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 995450 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 310070 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 2267050 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 769630 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 1628550 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 348560 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 510890 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " LandArea LandArea\n",
+ "no \n",
+ "1 9388211 (8000000.0, 12000000.0]\n",
+ "2 2973190 (0.0, 4000000.0]\n",
+ "3 9147420 (8000000.0, 12000000.0]\n",
+ "4 1811570 (0.0, 4000000.0]\n",
+ "5 770880 (0.0, 4000000.0]\n",
+ "6 8358140 (8000000.0, 12000000.0]\n",
+ "7 910770 (0.0, 4000000.0]\n",
+ "8 130170 (0.0, 4000000.0]\n",
+ "9 16376870 NaN\n",
+ "10 1943950 (0.0, 4000000.0]\n",
+ "11 364555 (0.0, 4000000.0]\n",
+ "12 1000000 (0.0, 4000000.0]\n",
+ "13 298170 (0.0, 4000000.0]\n",
+ "14 995450 (0.0, 4000000.0]\n",
+ "15 310070 (0.0, 4000000.0]\n",
+ "16 2267050 (0.0, 4000000.0]\n",
+ "17 769630 (0.0, 4000000.0]\n",
+ "18 1628550 (0.0, 4000000.0]\n",
+ "19 348560 (0.0, 4000000.0]\n",
+ "20 510890 (0.0, 4000000.0]"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins2))], axis=1).head(20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " LandArea | \n",
+ " LandArea | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 9388211 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2973190 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 9147420 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1811570 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 770880 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 8358140 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 910770 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 130170 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 16376870 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 1943950 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 364555 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 1000000 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 298170 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 995450 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 310070 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 2267050 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 769630 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 1628550 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 348560 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 510890 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " LandArea LandArea\n",
+ "no \n",
+ "1 9388211 Big\n",
+ "2 2973190 Small\n",
+ "3 9147420 Big\n",
+ "4 1811570 Small\n",
+ "5 770880 Small\n",
+ "6 8358140 Big\n",
+ "7 910770 Small\n",
+ "8 130170 Small\n",
+ "9 16376870 NaN\n",
+ "10 1943950 Small\n",
+ "11 364555 Small\n",
+ "12 1000000 Small\n",
+ "13 298170 Small\n",
+ "14 995450 Small\n",
+ "15 310070 Small\n",
+ "16 2267050 Small\n",
+ "17 769630 Small\n",
+ "18 1628550 Small\n",
+ "19 348560 Small\n",
+ "20 510890 Small"
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat(\n",
+ " [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins2), labels=labels)],\n",
+ " axis=1,\n",
+ ").head(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Разделение данных на 5 групп с установкой собственных интервалов (0 – 1000, 1000 – 100 000, 100 000 – 500 000, 500 000 – 3 000 000, свыше 3 000 000)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(array([0.e+00, 1.e+03, 1.e+05, 5.e+05, 3.e+06, inf]),\n",
+ " array([52, 77, 56, 44, 6]))"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "labels2 = [\"Dwarf\", \"Small\", \"Middle\", \"Big\", \"Giant\"]\n",
+ "hist3, bins3 = np.histogram(\n",
+ "\n",
+ " countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins=[0, 1000, 100000, 500000, 3000000, np.inf]\n",
+ ")\n",
+ "\n",
+ "\n",
+ "bins3, hist3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " LandArea | \n",
+ " LandArea | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 9388211 | \n",
+ " (3000000.0, inf] | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2973190 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 9147420 | \n",
+ " (3000000.0, inf] | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1811570 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 770880 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 8358140 | \n",
+ " (3000000.0, inf] | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 910770 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 130170 | \n",
+ " (100000.0, 500000.0] | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 16376870 | \n",
+ " (3000000.0, inf] | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 1943950 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 364555 | \n",
+ " (100000.0, 500000.0] | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 1000000 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 298170 | \n",
+ " (100000.0, 500000.0] | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 995450 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 310070 | \n",
+ " (100000.0, 500000.0] | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 2267050 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 769630 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 1628550 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 348560 | \n",
+ " (100000.0, 500000.0] | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 510890 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " LandArea LandArea\n",
+ "no \n",
+ "1 9388211 (3000000.0, inf]\n",
+ "2 2973190 (500000.0, 3000000.0]\n",
+ "3 9147420 (3000000.0, inf]\n",
+ "4 1811570 (500000.0, 3000000.0]\n",
+ "5 770880 (500000.0, 3000000.0]\n",
+ "6 8358140 (3000000.0, inf]\n",
+ "7 910770 (500000.0, 3000000.0]\n",
+ "8 130170 (100000.0, 500000.0]\n",
+ "9 16376870 (3000000.0, inf]\n",
+ "10 1943950 (500000.0, 3000000.0]\n",
+ "11 364555 (100000.0, 500000.0]\n",
+ "12 1000000 (500000.0, 3000000.0]\n",
+ "13 298170 (100000.0, 500000.0]\n",
+ "14 995450 (500000.0, 3000000.0]\n",
+ "15 310070 (100000.0, 500000.0]\n",
+ "16 2267050 (500000.0, 3000000.0]\n",
+ "17 769630 (500000.0, 3000000.0]\n",
+ "18 1628550 (500000.0, 3000000.0]\n",
+ "19 348560 (100000.0, 500000.0]\n",
+ "20 510890 (500000.0, 3000000.0]"
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins3))], axis=1).head(20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " LandArea | \n",
+ " LandArea | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 9388211 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2973190 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 9147420 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1811570 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 770880 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 8358140 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 910770 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 130170 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 16376870 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 1943950 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 364555 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 1000000 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 298170 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 995450 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 310070 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 2267050 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 769630 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 1628550 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 348560 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 510890 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " LandArea LandArea\n",
+ "no \n",
+ "1 9388211 Giant\n",
+ "2 2973190 Big\n",
+ "3 9147420 Giant\n",
+ "4 1811570 Big\n",
+ "5 770880 Big\n",
+ "6 8358140 Giant\n",
+ "7 910770 Big\n",
+ "8 130170 Middle\n",
+ "9 16376870 Giant\n",
+ "10 1943950 Big\n",
+ "11 364555 Middle\n",
+ "12 1000000 Big\n",
+ "13 298170 Middle\n",
+ "14 995450 Big\n",
+ "15 310070 Middle\n",
+ "16 2267050 Big\n",
+ "17 769630 Big\n",
+ "18 1628550 Big\n",
+ "19 348560 Middle\n",
+ "20 510890 Big"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat(\n",
+ " [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins3), labels=labels2)],\n",
+ " axis=1,\n",
+ ").head(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Квантильное разделение данных на 5 групп\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " LandArea | \n",
+ " LandArea | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 9388211 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2973190 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 9147420 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1811570 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 770880 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 8358140 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 910770 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 130170 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 16376870 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 1943950 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 364555 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 1000000 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 298170 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 995450 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 310070 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 2267050 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 769630 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 1628550 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 348560 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 510890 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " LandArea LandArea\n",
+ "no \n",
+ "1 9388211 4\n",
+ "2 2973190 4\n",
+ "3 9147420 4\n",
+ "4 1811570 4\n",
+ "5 770880 4\n",
+ "6 8358140 4\n",
+ "7 910770 4\n",
+ "8 130170 2\n",
+ "9 16376870 4\n",
+ "10 1943950 4\n",
+ "11 364555 3\n",
+ "12 1000000 4\n",
+ "13 298170 3\n",
+ "14 995450 4\n",
+ "15 310070 3\n",
+ "16 2267050 4\n",
+ "17 769630 4\n",
+ "18 1628550 4\n",
+ "19 348560 3\n",
+ "20 510890 3"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat([countries[\"LandArea\"], pd.qcut(countries[\"LandArea\"], q=5, labels=False)], axis=1).head(20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " LandArea | \n",
+ " LandArea | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 9388211 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2973190 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 9147420 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1811570 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 770880 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 8358140 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 910770 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 130170 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 16376870 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 1943950 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 364555 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 1000000 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 298170 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 995450 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 310070 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 2267050 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 769630 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 1628550 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 348560 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 510890 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " LandArea LandArea\n",
+ "no \n",
+ "1 9388211 Giant\n",
+ "2 2973190 Giant\n",
+ "3 9147420 Giant\n",
+ "4 1811570 Giant\n",
+ "5 770880 Giant\n",
+ "6 8358140 Giant\n",
+ "7 910770 Giant\n",
+ "8 130170 Middle\n",
+ "9 16376870 Giant\n",
+ "10 1943950 Giant\n",
+ "11 364555 Big\n",
+ "12 1000000 Giant\n",
+ "13 298170 Big\n",
+ "14 995450 Giant\n",
+ "15 310070 Big\n",
+ "16 2267050 Giant\n",
+ "17 769630 Giant\n",
+ "18 1628550 Giant\n",
+ "19 348560 Big\n",
+ "20 510890 Big"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat([countries[\"LandArea\"], pd.qcut(countries[\"LandArea\"], q=5, labels=labels2)], axis=1).head(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Пример конструирования признаков на основе существующих\n",
+ "\n",
+ "Title - обращение к пассажиру (Mr, Mrs, Miss)\n",
+ "\n",
+ "Is_married - замужняя ли женщина\n",
+ "\n",
+ "Cabin_type - палуба (тип каюты)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# titanic_cl = titanic.drop(\n",
+ "# [\"Embarked_Q\", \"Embarked_S\", \"Embarked_nan\", \"Sex_male\"], axis=1, errors=\"ignore\"\n",
+ "# )\n",
+ "# titanic_cl = titanic_cl.dropna()\n",
+ "\n",
+ "# titanic_cl[\"Title\"] = [\n",
+ "# i.split(\",\")[1].split(\".\")[0].strip() for i in titanic_cl[\"Name\"]\n",
+ "# ]\n",
+ "\n",
+ "# titanic_cl[\"Is_married\"] = [1 if i == \"Mrs\" else 0 for i in titanic_cl[\"Title\"]]\n",
+ "\n",
+ "# titanic_cl[\"Cabin_type\"] = [i[0] for i in titanic_cl[\"Cabin\"]]\n",
+ "\n",
+ "# titanic_cl"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Пример использования библиотеки Featuretools для автоматического конструирования (синтеза) признаков\n",
+ "\n",
+ "https://featuretools.alteryx.com/en/stable/getting_started/using_entitysets.html"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Загрузка данных\n",
+ "\n",
+ "За основу взяты наборы данных о населении стран мира (2020 год) и прогнозе численности населения на 2020–2050 годы\n",
+ "\n",
+ "Дополнительно загружается справочник стран с их столицами и континентами\n",
+ "\n",
+ "Файлы: data/world-population-by-country-2020.csv, data/world-population-forcast-2020-2050.csv, data/countries-continents-capitals.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "( no Country Population2020 Yearly NetChange Density \\\n",
+ " 0 1 China 1439323776 0.39 5540090 153 \n",
+ " 1 2 India 1380004385 0.99 13586631 464 \n",
+ " 2 3 United States 331002651 0.59 1937734 36 \n",
+ " 3 4 Indonesia 273523615 1.07 2898047 151 \n",
+ " 4 5 Pakistan 220892340 2.00 4327022 287 \n",
+ " .. ... ... ... ... ... ... \n",
+ " 230 231 Montserrat 4992 0.06 3 50 \n",
+ " 231 232 Falkland Islands 3480 3.05 103 0 \n",
+ " 232 233 Niue 1626 0.68 11 6 \n",
+ " 233 234 Tokelau 1357 1.27 17 136 \n",
+ " 234 235 Holy See 801 0.25 2 2,003 \n",
+ " \n",
+ " LandArea \n",
+ " 0 9388211 \n",
+ " 1 2973190 \n",
+ " 2 9147420 \n",
+ " 3 1811570 \n",
+ " 4 770880 \n",
+ " .. ... \n",
+ " 230 100 \n",
+ " 231 12170 \n",
+ " 232 260 \n",
+ " 233 10 \n",
+ " 234 0 \n",
+ " \n",
+ " [235 rows x 7 columns],\n",
+ " Year Population YearlyPer Yearly Median Fertility Density\n",
+ " 0 2020 7794798739 1.10 83000320 31 2.47 52\n",
+ " 1 2025 8184437460 0.98 77927744 32 2.54 55\n",
+ " 2 2030 8548487400 0.87 72809988 33 2.62 57\n",
+ " 3 2035 8887524213 0.78 67807363 34 2.70 60\n",
+ " 4 2040 9198847240 0.69 62264605 35 2.77 62\n",
+ " 5 2045 9481803274 0.61 56591207 35 2.85 64\n",
+ " 6 2050 9735033990 0.53 50646143 36 2.95 65,\n",
+ " Country Capital Continent\n",
+ " 0 Afghanistan Kabul Asia\n",
+ " 1 Albania Tirana Europe\n",
+ " 2 Algeria Algiers Africa\n",
+ " 3 American Samoa Pago Pago Oceania\n",
+ " 4 Andorra Andorra la Vella Europe\n",
+ " .. ... ... ...\n",
+ " 229 Wallis and Futuna Mata-Utu Oceania\n",
+ " 230 Western Sahara El Aai?�n Africa\n",
+ " 231 Yemen Sanaa Asia\n",
+ " 232 Zambia Lusaka Africa\n",
+ " 233 Zimbabwe Harare Africa\n",
+ " \n",
+ " [234 rows x 3 columns])"
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import featuretools as ft\n",
+ "from woodwork.logical_types import Categorical, Datetime\n",
+ "\n",
+ "info = pd.read_csv(\"data/world-population-by-country-2020.csv\")\n",
+ "forcast = pd.read_csv(\"data/world-population-forcast-2020-2050.csv\")\n",
+ "capitals = pd.read_csv(\"data/countries-continents-capitals.csv\", encoding=\"ISO-8859-1\")\n",
+ "forcast[\"Population\"] = forcast[\"Population\"].apply(\n",
+ " lambda x: int(\"\".join(x.split(\",\")))\n",
+ ")\n",
+ "forcast[\"YearlyPer\"] = forcast[\"YearlyPer\"].apply(\n",
+ " lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
+ ")\n",
+ "forcast[\"Yearly\"] = forcast[\"Yearly\"].apply(\n",
+ " lambda x: int(\"\".join(x.split(\",\")))\n",
+ ")\n",
+ "info = info.drop([\"Migrants\", \"FertRate\", \"MedAge\", \"UrbanPop\", \"WorldShare\"], axis=1)\n",
+ "info[\"Population2020\"] = info[\"Population2020\"].apply(\n",
+ " lambda x: int(\"\".join(x.split(\",\")))\n",
+ ")\n",
+ "info[\"Yearly\"] = info[\"Yearly\"].apply(\n",
+ " lambda x: float(\"\".join(x.rstrip(\"%\")))\n",
+ ")\n",
+ "info[\"NetChange\"] = info[\"NetChange\"].apply(\n",
+ " lambda x: int(\"\".join(x.split(\",\")))\n",
+ ")\n",
+ "info[\"LandArea\"] = info[\"LandArea\"].apply(\n",
+ " lambda x: int(\"\".join(x.split(\",\")))\n",
+ ")\n",
+ "\n",
+ "info, forcast, capitals"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Создание сущностей в featuretools\n",
+ "\n",
+ "Добавление dataframe'ов с данными в EntitySet с указанием параметров: название сущности (таблицы), первичный ключ, категориальные атрибуты (в том числе даты)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
+ " pd.to_datetime(\n",
+ "c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
+ " pd.to_datetime(\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Entityset: countries\n",
+ " DataFrames:\n",
+ " countries [Rows: 235, Columns: 7]\n",
+ " capitals [Rows: 234, Columns: 3]\n",
+ " forcast [Rows: 7, Columns: 8]\n",
+ " Relationships:\n",
+ " No relationships"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "es = ft.EntitySet(id=\"countries\")\n",
+ "\n",
+ "es = es.add_dataframe(\n",
+ " dataframe_name=\"countries\",\n",
+ " dataframe=info,\n",
+ " index=\"no\",\n",
+ " logical_types={\n",
+ " \"Country\": Categorical,\n",
+ " },\n",
+ ")\n",
+ "es = es.add_dataframe(\n",
+ " dataframe_name=\"capitals\",\n",
+ " dataframe=capitals,\n",
+ " index=\"Country\",\n",
+ " logical_types={\n",
+ " \"Country\": Categorical,\n",
+ " \"Capital\": Categorical,\n",
+ " \"Continent\": Categorical,\n",
+ " },\n",
+ ")\n",
+ "es = es.add_dataframe(\n",
+ " dataframe_name=\"forcast\",\n",
+ " dataframe=forcast,\n",
+ " index=\"forcast_id\",\n",
+ " make_index=True,\n",
+ " logical_types={\n",
+ " \"Year\": Datetime,\n",
+ " },\n",
+ ")\n",
+ "\n",
+ "es"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Настройка связей между сущностями featuretools\n",
+ "\n",
+ "Настройка связей между таблицами на уровне ключей\n",
+ "\n",
+ "Связь указывается от родителя к потомкам (таблица-родитель, первичный ключ, таблица-потомок, внешний ключ)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Entityset: countries\n",
+ " DataFrames:\n",
+ " countries [Rows: 235, Columns: 7]\n",
+ " capitals [Rows: 234, Columns: 3]\n",
+ " forcast [Rows: 7, Columns: 8]\n",
+ " Relationships:\n",
+ " countries.Country -> capitals.Country"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "es = es.add_relationship(\"capitals\", \"Country\", \"countries\", \"Country\")\n",
+ "\n",
+ "es"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Автоматическое конструирование признаков с помощью featuretools\n",
+ "\n",
+ "Библиотека применяет различные функции агрегации и трансформации к атрибутам таблицы countries с учетом отношений\n",
+ "\n",
+ "Результат помещается в Dataframe feature_matrix"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country | \n",
+ " Population2020 | \n",
+ " Yearly | \n",
+ " NetChange | \n",
+ " LandArea | \n",
+ " capitals.Capital | \n",
+ " capitals.Continent | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " China | \n",
+ " 1439323776 | \n",
+ " 0.39 | \n",
+ " 5540090 | \n",
+ " 9388211 | \n",
+ " Beijing | \n",
+ " Asia | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " India | \n",
+ " 1380004385 | \n",
+ " 0.99 | \n",
+ " 13586631 | \n",
+ " 2973190 | \n",
+ " New Delhi | \n",
+ " Asia | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " United States | \n",
+ " 331002651 | \n",
+ " 0.59 | \n",
+ " 1937734 | \n",
+ " 9147420 | \n",
+ " Washington, D.C. | \n",
+ " North America | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Indonesia | \n",
+ " 273523615 | \n",
+ " 1.07 | \n",
+ " 2898047 | \n",
+ " 1811570 | \n",
+ " Jakarta | \n",
+ " Asia | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Pakistan | \n",
+ " 220892340 | \n",
+ " 2.00 | \n",
+ " 4327022 | \n",
+ " 770880 | \n",
+ " Islamabad | \n",
+ " Asia | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 231 | \n",
+ " Montserrat | \n",
+ " 4992 | \n",
+ " 0.06 | \n",
+ " 3 | \n",
+ " 100 | \n",
+ " Brades | \n",
+ " North America | \n",
+ "
\n",
+ " \n",
+ " 232 | \n",
+ " Falkland Islands | \n",
+ " 3480 | \n",
+ " 3.05 | \n",
+ " 103 | \n",
+ " 12170 | \n",
+ " Stanley | \n",
+ " South America | \n",
+ "
\n",
+ " \n",
+ " 233 | \n",
+ " Niue | \n",
+ " 1626 | \n",
+ " 0.68 | \n",
+ " 11 | \n",
+ " 260 | \n",
+ " Alofi | \n",
+ " Oceania | \n",
+ "
\n",
+ " \n",
+ " 234 | \n",
+ " Tokelau | \n",
+ " 1357 | \n",
+ " 1.27 | \n",
+ " 17 | \n",
+ " 10 | \n",
+ " Nukunonu | \n",
+ " Oceania | \n",
+ "
\n",
+ " \n",
+ " 235 | \n",
+ " Holy See | \n",
+ " 801 | \n",
+ " 0.25 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
235 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Country Population2020 Yearly NetChange LandArea \\\n",
+ "no \n",
+ "1 China 1439323776 0.39 5540090 9388211 \n",
+ "2 India 1380004385 0.99 13586631 2973190 \n",
+ "3 United States 331002651 0.59 1937734 9147420 \n",
+ "4 Indonesia 273523615 1.07 2898047 1811570 \n",
+ "5 Pakistan 220892340 2.00 4327022 770880 \n",
+ ".. ... ... ... ... ... \n",
+ "231 Montserrat 4992 0.06 3 100 \n",
+ "232 Falkland Islands 3480 3.05 103 12170 \n",
+ "233 Niue 1626 0.68 11 260 \n",
+ "234 Tokelau 1357 1.27 17 10 \n",
+ "235 Holy See 801 0.25 2 0 \n",
+ "\n",
+ " capitals.Capital capitals.Continent \n",
+ "no \n",
+ "1 Beijing Asia \n",
+ "2 New Delhi Asia \n",
+ "3 Washington, D.C. North America \n",
+ "4 Jakarta Asia \n",
+ "5 Islamabad Asia \n",
+ ".. ... ... \n",
+ "231 Brades North America \n",
+ "232 Stanley South America \n",
+ "233 Alofi Oceania \n",
+ "234 Nukunonu Oceania \n",
+ "235 NaN NaN \n",
+ "\n",
+ "[235 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 54,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "feature_matrix, feature_defs = ft.dfs(\n",
+ " entityset=es,\n",
+ " target_dataframe_name=\"countries\",\n",
+ " max_depth=1,\n",
+ ")\n",
+ "\n",
+ "feature_matrix"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Полученные признаки\n",
+ "\n",
+ "Список колонок полученного dataframe'а"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ,\n",
+ " ]"
+ ]
+ },
+ "execution_count": 55,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "feature_defs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Отсечение значений признаков"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Определение выбросов с помощью boxplot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "countries.boxplot(column=\"Population2020\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Отсечение данных для признака Population2020, значение которых больше 50 000 000"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country | \n",
+ " Population2020 | \n",
+ " PopulationClip | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " China | \n",
+ " 1439323776 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " India | \n",
+ " 1380004385 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " United States | \n",
+ " 331002651 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Indonesia | \n",
+ " 273523615 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Pakistan | \n",
+ " 220892340 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Brazil | \n",
+ " 212559417 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Nigeria | \n",
+ " 206139589 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Bangladesh | \n",
+ " 164689383 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Russia | \n",
+ " 145934462 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Mexico | \n",
+ " 128932753 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Japan | \n",
+ " 126476461 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Ethiopia | \n",
+ " 114963588 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Philippines | \n",
+ " 109581078 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Egypt | \n",
+ " 102334404 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Vietnam | \n",
+ " 97338579 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " DR Congo | \n",
+ " 89561403 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Turkey | \n",
+ " 84339067 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Iran | \n",
+ " 83992949 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Germany | \n",
+ " 83783942 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Thailand | \n",
+ " 69799978 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " United Kingdom | \n",
+ " 67886011 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " France | \n",
+ " 65273511 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " Italy | \n",
+ " 60461826 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " Tanzania | \n",
+ " 59734218 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " South Africa | \n",
+ " 59308690 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " Myanmar | \n",
+ " 54409800 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " Kenya | \n",
+ " 53771296 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " South Korea | \n",
+ " 51269185 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " Colombia | \n",
+ " 50882891 | \n",
+ " 50000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Country Population2020 PopulationClip\n",
+ "no \n",
+ "1 China 1439323776 50000000\n",
+ "2 India 1380004385 50000000\n",
+ "3 United States 331002651 50000000\n",
+ "4 Indonesia 273523615 50000000\n",
+ "5 Pakistan 220892340 50000000\n",
+ "6 Brazil 212559417 50000000\n",
+ "7 Nigeria 206139589 50000000\n",
+ "8 Bangladesh 164689383 50000000\n",
+ "9 Russia 145934462 50000000\n",
+ "10 Mexico 128932753 50000000\n",
+ "11 Japan 126476461 50000000\n",
+ "12 Ethiopia 114963588 50000000\n",
+ "13 Philippines 109581078 50000000\n",
+ "14 Egypt 102334404 50000000\n",
+ "15 Vietnam 97338579 50000000\n",
+ "16 DR Congo 89561403 50000000\n",
+ "17 Turkey 84339067 50000000\n",
+ "18 Iran 83992949 50000000\n",
+ "19 Germany 83783942 50000000\n",
+ "20 Thailand 69799978 50000000\n",
+ "21 United Kingdom 67886011 50000000\n",
+ "22 France 65273511 50000000\n",
+ "23 Italy 60461826 50000000\n",
+ "24 Tanzania 59734218 50000000\n",
+ "25 South Africa 59308690 50000000\n",
+ "26 Myanmar 54409800 50000000\n",
+ "27 Kenya 53771296 50000000\n",
+ "28 South Korea 51269185 50000000\n",
+ "29 Colombia 50882891 50000000"
+ ]
+ },
+ "execution_count": 57,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "countries_norm = countries.copy()\n",
+ "\n",
+ "countries_norm[\"PopulationClip\"] = countries_norm[\"Population2020\"].clip(0, 50000000);\n",
+ "\n",
+ "countries_norm[countries_norm[\"Population2020\"] > 50000000][\n",
+ " [\"Country\", \"Population2020\", \"PopulationClip\"]\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Винсоризация признака Population2020"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "111195830.99999991\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country | \n",
+ " Population2020 | \n",
+ " PopulationWinsorized | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " China | \n",
+ " 1439323776 | \n",
+ " 114963588 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " India | \n",
+ " 1380004385 | \n",
+ " 114963588 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " United States | \n",
+ " 331002651 | \n",
+ " 114963588 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Indonesia | \n",
+ " 273523615 | \n",
+ " 114963588 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Pakistan | \n",
+ " 220892340 | \n",
+ " 114963588 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Brazil | \n",
+ " 212559417 | \n",
+ " 114963588 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Nigeria | \n",
+ " 206139589 | \n",
+ " 114963588 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Bangladesh | \n",
+ " 164689383 | \n",
+ " 114963588 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Russia | \n",
+ " 145934462 | \n",
+ " 114963588 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Mexico | \n",
+ " 128932753 | \n",
+ " 114963588 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Japan | \n",
+ " 126476461 | \n",
+ " 114963588 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Ethiopia | \n",
+ " 114963588 | \n",
+ " 114963588 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Philippines | \n",
+ " 109581078 | \n",
+ " 109581078 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Egypt | \n",
+ " 102334404 | \n",
+ " 102334404 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Vietnam | \n",
+ " 97338579 | \n",
+ " 97338579 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " DR Congo | \n",
+ " 89561403 | \n",
+ " 89561403 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Turkey | \n",
+ " 84339067 | \n",
+ " 84339067 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Iran | \n",
+ " 83992949 | \n",
+ " 83992949 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Germany | \n",
+ " 83783942 | \n",
+ " 83783942 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Thailand | \n",
+ " 69799978 | \n",
+ " 69799978 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " United Kingdom | \n",
+ " 67886011 | \n",
+ " 67886011 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " France | \n",
+ " 65273511 | \n",
+ " 65273511 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " Italy | \n",
+ " 60461826 | \n",
+ " 60461826 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " Tanzania | \n",
+ " 59734218 | \n",
+ " 59734218 | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " South Africa | \n",
+ " 59308690 | \n",
+ " 59308690 | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " Myanmar | \n",
+ " 54409800 | \n",
+ " 54409800 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " Kenya | \n",
+ " 53771296 | \n",
+ " 53771296 | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " South Korea | \n",
+ " 51269185 | \n",
+ " 51269185 | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " Colombia | \n",
+ " 50882891 | \n",
+ " 50882891 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Country Population2020 PopulationWinsorized\n",
+ "no \n",
+ "1 China 1439323776 114963588\n",
+ "2 India 1380004385 114963588\n",
+ "3 United States 331002651 114963588\n",
+ "4 Indonesia 273523615 114963588\n",
+ "5 Pakistan 220892340 114963588\n",
+ "6 Brazil 212559417 114963588\n",
+ "7 Nigeria 206139589 114963588\n",
+ "8 Bangladesh 164689383 114963588\n",
+ "9 Russia 145934462 114963588\n",
+ "10 Mexico 128932753 114963588\n",
+ "11 Japan 126476461 114963588\n",
+ "12 Ethiopia 114963588 114963588\n",
+ "13 Philippines 109581078 109581078\n",
+ "14 Egypt 102334404 102334404\n",
+ "15 Vietnam 97338579 97338579\n",
+ "16 DR Congo 89561403 89561403\n",
+ "17 Turkey 84339067 84339067\n",
+ "18 Iran 83992949 83992949\n",
+ "19 Germany 83783942 83783942\n",
+ "20 Thailand 69799978 69799978\n",
+ "21 United Kingdom 67886011 67886011\n",
+ "22 France 65273511 65273511\n",
+ "23 Italy 60461826 60461826\n",
+ "24 Tanzania 59734218 59734218\n",
+ "25 South Africa 59308690 59308690\n",
+ "26 Myanmar 54409800 54409800\n",
+ "27 Kenya 53771296 53771296\n",
+ "28 South Korea 51269185 51269185\n",
+ "29 Colombia 50882891 50882891"
+ ]
+ },
+ "execution_count": 58,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from scipy.stats.mstats import winsorize\n",
+ "\n",
+ "print(countries_norm[\"Population2020\"].quantile(q=0.95))\n",
+ "\n",
+ "countries_norm[\"PopulationWinsorized\"] = winsorize(\n",
+ " countries_norm[\"Population2020\"].fillna(countries_norm[\"Population2020\"].mean()),\n",
+ " (0, 0.05),\n",
+ " inplace=False,\n",
+ ")\n",
+ "\n",
+ "countries_norm[countries_norm[\"Population2020\"] > 50000000][\n",
+ " [\"Country\", \"Population2020\", \"PopulationWinsorized\"]\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Нормализация значений"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country | \n",
+ " Population2020 | \n",
+ " PopulationNorm | \n",
+ " PopulationClipNorm | \n",
+ " PopulationWinsorizedNorm | \n",
+ " PopulationWinsorizedNorm2 | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " China | \n",
+ " 1439323776 | \n",
+ " 1.000000e+00 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " India | \n",
+ " 1380004385 | \n",
+ " 9.587866e-01 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " United States | \n",
+ " 331002651 | \n",
+ " 2.299705e-01 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Indonesia | \n",
+ " 273523615 | \n",
+ " 1.900357e-01 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Pakistan | \n",
+ " 220892340 | \n",
+ " 1.534691e-01 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 231 | \n",
+ " Montserrat | \n",
+ " 4992 | \n",
+ " 2.911786e-06 | \n",
+ " 0.000084 | \n",
+ " 0.000036 | \n",
+ " -0.999927 | \n",
+ "
\n",
+ " \n",
+ " 232 | \n",
+ " Falkland Islands | \n",
+ " 3480 | \n",
+ " 1.861292e-06 | \n",
+ " 0.000054 | \n",
+ " 0.000023 | \n",
+ " -0.999953 | \n",
+ "
\n",
+ " \n",
+ " 233 | \n",
+ " Niue | \n",
+ " 1626 | \n",
+ " 5.731862e-07 | \n",
+ " 0.000017 | \n",
+ " 0.000007 | \n",
+ " -0.999986 | \n",
+ "
\n",
+ " \n",
+ " 234 | \n",
+ " Tokelau | \n",
+ " 1357 | \n",
+ " 3.862927e-07 | \n",
+ " 0.000011 | \n",
+ " 0.000005 | \n",
+ " -0.999990 | \n",
+ "
\n",
+ " \n",
+ " 235 | \n",
+ " Holy See | \n",
+ " 801 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " -1.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
235 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Country Population2020 PopulationNorm PopulationClipNorm \\\n",
+ "no \n",
+ "1 China 1439323776 1.000000e+00 1.000000 \n",
+ "2 India 1380004385 9.587866e-01 1.000000 \n",
+ "3 United States 331002651 2.299705e-01 1.000000 \n",
+ "4 Indonesia 273523615 1.900357e-01 1.000000 \n",
+ "5 Pakistan 220892340 1.534691e-01 1.000000 \n",
+ ".. ... ... ... ... \n",
+ "231 Montserrat 4992 2.911786e-06 0.000084 \n",
+ "232 Falkland Islands 3480 1.861292e-06 0.000054 \n",
+ "233 Niue 1626 5.731862e-07 0.000017 \n",
+ "234 Tokelau 1357 3.862927e-07 0.000011 \n",
+ "235 Holy See 801 0.000000e+00 0.000000 \n",
+ "\n",
+ " PopulationWinsorizedNorm PopulationWinsorizedNorm2 \n",
+ "no \n",
+ "1 1.000000 1.000000 \n",
+ "2 1.000000 1.000000 \n",
+ "3 1.000000 1.000000 \n",
+ "4 1.000000 1.000000 \n",
+ "5 1.000000 1.000000 \n",
+ ".. ... ... \n",
+ "231 0.000036 -0.999927 \n",
+ "232 0.000023 -0.999953 \n",
+ "233 0.000007 -0.999986 \n",
+ "234 0.000005 -0.999990 \n",
+ "235 0.000000 -1.000000 \n",
+ "\n",
+ "[235 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 59,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn import preprocessing\n",
+ "\n",
+ "min_max_scaler = preprocessing.MinMaxScaler()\n",
+ "\n",
+ "min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n",
+ "\n",
+ "countries_norm[\"PopulationNorm\"] = min_max_scaler.fit_transform(\n",
+ " countries_norm[\"Population2020\"].to_numpy().reshape(-1, 1)\n",
+ ").reshape(countries_norm[\"Population2020\"].shape)\n",
+ "\n",
+ "countries_norm[\"PopulationClipNorm\"] = min_max_scaler.fit_transform(\n",
+ " countries_norm[\"PopulationClip\"].to_numpy().reshape(-1, 1)\n",
+ ").reshape(countries_norm[\"Population2020\"].shape)\n",
+ "\n",
+ "countries_norm[\"PopulationWinsorizedNorm\"] = min_max_scaler.fit_transform(\n",
+ " countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n",
+ ").reshape(countries_norm[\"Population2020\"].shape)\n",
+ "\n",
+ "countries_norm[\"PopulationWinsorizedNorm2\"] = min_max_scaler_2.fit_transform(\n",
+ " countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n",
+ ").reshape(countries_norm[\"Population2020\"].shape)\n",
+ "\n",
+ "countries_norm[\n",
+ " [\n",
+ " \"Country\",\n",
+ " \"Population2020\",\n",
+ " \"PopulationNorm\",\n",
+ " \"PopulationClipNorm\",\n",
+ " \"PopulationWinsorizedNorm\",\n",
+ " \"PopulationWinsorizedNorm2\",\n",
+ " ]\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Стандартизация значений"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country | \n",
+ " Population2020 | \n",
+ " PopulationStand | \n",
+ " PopulationClipStand | \n",
+ " PopulationWinsorizedStand | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " China | \n",
+ " 1439323776 | \n",
+ " 10.427597 | \n",
+ " 2.073933 | \n",
+ " 3.171659 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " India | \n",
+ " 1380004385 | \n",
+ " 9.987702 | \n",
+ " 2.073933 | \n",
+ " 3.171659 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " United States | \n",
+ " 331002651 | \n",
+ " 2.208627 | \n",
+ " 2.073933 | \n",
+ " 3.171659 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Indonesia | \n",
+ " 273523615 | \n",
+ " 1.782380 | \n",
+ " 2.073933 | \n",
+ " 3.171659 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Pakistan | \n",
+ " 220892340 | \n",
+ " 1.392082 | \n",
+ " 2.073933 | \n",
+ " 3.171659 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 231 | \n",
+ " Montserrat | \n",
+ " 4992 | \n",
+ " -0.245950 | \n",
+ " -0.795071 | \n",
+ " -0.621969 | \n",
+ "
\n",
+ " \n",
+ " 232 | \n",
+ " Falkland Islands | \n",
+ " 3480 | \n",
+ " -0.245962 | \n",
+ " -0.795158 | \n",
+ " -0.622019 | \n",
+ "
\n",
+ " \n",
+ " 233 | \n",
+ " Niue | \n",
+ " 1626 | \n",
+ " -0.245975 | \n",
+ " -0.795265 | \n",
+ " -0.622080 | \n",
+ "
\n",
+ " \n",
+ " 234 | \n",
+ " Tokelau | \n",
+ " 1357 | \n",
+ " -0.245977 | \n",
+ " -0.795280 | \n",
+ " -0.622089 | \n",
+ "
\n",
+ " \n",
+ " 235 | \n",
+ " Holy See | \n",
+ " 801 | \n",
+ " -0.245982 | \n",
+ " -0.795312 | \n",
+ " -0.622107 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
235 rows × 5 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Country Population2020 PopulationStand PopulationClipStand \\\n",
+ "no \n",
+ "1 China 1439323776 10.427597 2.073933 \n",
+ "2 India 1380004385 9.987702 2.073933 \n",
+ "3 United States 331002651 2.208627 2.073933 \n",
+ "4 Indonesia 273523615 1.782380 2.073933 \n",
+ "5 Pakistan 220892340 1.392082 2.073933 \n",
+ ".. ... ... ... ... \n",
+ "231 Montserrat 4992 -0.245950 -0.795071 \n",
+ "232 Falkland Islands 3480 -0.245962 -0.795158 \n",
+ "233 Niue 1626 -0.245975 -0.795265 \n",
+ "234 Tokelau 1357 -0.245977 -0.795280 \n",
+ "235 Holy See 801 -0.245982 -0.795312 \n",
+ "\n",
+ " PopulationWinsorizedStand \n",
+ "no \n",
+ "1 3.171659 \n",
+ "2 3.171659 \n",
+ "3 3.171659 \n",
+ "4 3.171659 \n",
+ "5 3.171659 \n",
+ ".. ... \n",
+ "231 -0.621969 \n",
+ "232 -0.622019 \n",
+ "233 -0.622080 \n",
+ "234 -0.622089 \n",
+ "235 -0.622107 \n",
+ "\n",
+ "[235 rows x 5 columns]"
+ ]
+ },
+ "execution_count": 60,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn import preprocessing\n",
+ "\n",
+ "stndart_scaler = preprocessing.StandardScaler()\n",
+ "\n",
+ "countries_norm[\"PopulationStand\"] = stndart_scaler.fit_transform(\n",
+ " countries_norm[\"Population2020\"].to_numpy().reshape(-1, 1)\n",
+ ").reshape(countries_norm[\"Population2020\"].shape)\n",
+ "\n",
+ "countries_norm[\"PopulationClipStand\"] = stndart_scaler.fit_transform(\n",
+ " countries_norm[\"PopulationClip\"].to_numpy().reshape(-1, 1)\n",
+ ").reshape(countries_norm[\"Population2020\"].shape)\n",
+ "\n",
+ "countries_norm[\"PopulationWinsorizedStand\"] = stndart_scaler.fit_transform(\n",
+ " countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n",
+ ").reshape(countries_norm[\"Population2020\"].shape)\n",
+ "\n",
+ "countries_norm[\n",
+ " [\n",
+ " \"Country\",\n",
+ " \"Population2020\",\n",
+ " \"PopulationStand\",\n",
+ " \"PopulationClipStand\",\n",
+ " \"PopulationWinsorizedStand\",\n",
+ " ]\n",
+ "]"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/poetry.lock b/poetry.lock
index 3899a43..2f5cecc 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -467,6 +467,17 @@ files = [
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
+[[package]]
+name = "cloudpickle"
+version = "3.1.0"
+description = "Pickler class to extend the standard pickle.Pickler functionality"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "cloudpickle-3.1.0-py3-none-any.whl", hash = "sha256:fe11acda67f61aaaec473e3afe030feb131d78a43461b718185363384f1ba12e"},
+ {file = "cloudpickle-3.1.0.tar.gz", hash = "sha256:81a929b6e3c7335c863c771d673d105f02efdb89dfaba0c90495d1c64796601b"},
+]
+
[[package]]
name = "colorama"
version = "0.4.6"
@@ -675,6 +686,41 @@ files = [
[package.extras]
devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"]
+[[package]]
+name = "featuretools"
+version = "1.31.0"
+description = "a framework for automated feature engineering"
+optional = false
+python-versions = "<4,>=3.9"
+files = [
+ {file = "featuretools-1.31.0-py3-none-any.whl", hash = "sha256:87c94e9ae959c89acd83da96bd2583f3ef0f6daaa9639cbb6e46dbde2c742a18"},
+ {file = "featuretools-1.31.0.tar.gz", hash = "sha256:01bfb17fcc1715b4c3623c7bc94a8982122c4a0fa03350ed478601bb81f90155"},
+]
+
+[package.dependencies]
+cloudpickle = ">=1.5.0"
+holidays = ">=0.17"
+numpy = ">=1.25.0"
+packaging = ">=20.0"
+pandas = ">=2.0.0"
+psutil = ">=5.7.0"
+scipy = ">=1.10.0"
+tqdm = ">=4.66.3"
+woodwork = ">=0.28.0"
+
+[package.extras]
+autonormalize = ["autonormalize (>=2.0.1)"]
+complete = ["featuretools[dask,nlp,premium]"]
+dask = ["dask[dataframe] (>=2023.2.0)", "distributed (>=2023.2.0)"]
+dev = ["black[jupyter] (>=23.1.0)", "featuretools[dask,docs,test]", "pre-commit (>=2.20.0)", "ruff (>=0.1.6)"]
+docs = ["Sphinx (==5.1.1)", "autonormalize (>=2.0.1)", "click (>=7.0.0)", "featuretools[dask,test]", "ipython (==8.4.0)", "jupyter (==1.0.0)", "jupyter-client (>=8.0.2)", "matplotlib (==3.7.2)", "myst-parser (==0.18.0)", "nbconvert (==6.5.0)", "nbsphinx (==0.8.9)", "pydata-sphinx-theme (==0.9.0)", "sphinx-copybutton (==0.5.0)", "sphinx-inline-tabs (==2022.1.2b11)"]
+nlp = ["nlp-primitives (>=2.12.0)"]
+premium = ["premium-primitives (>=0.0.3)"]
+sklearn = ["featuretools-sklearn-transformer (>=1.0.0)"]
+sql = ["featuretools-sql (>=0.0.1)", "psycopg2-binary (>=2.9.3)"]
+test = ["boto3 (>=1.34.32)", "composeml (>=0.8.0)", "graphviz (>=0.8.4)", "moto[all] (>=5.0.0)", "pip (>=23.3.0)", "pyarrow (>=14.0.1)", "pympler (>=0.8)", "pytest (>=7.1.2)", "pytest-cov (>=3.0.0)", "pytest-timeout (>=2.1.0)", "pytest-xdist (>=2.5.0)", "smart-open (>=5.0.0)", "urllib3 (>=1.26.18)"]
+tsfresh = ["featuretools-tsfresh-primitives (>=1.0.0)"]
+
[[package]]
name = "flask"
version = "3.0.3"
@@ -833,6 +879,20 @@ files = [
{file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
]
+[[package]]
+name = "holidays"
+version = "0.59"
+description = "World Holidays Framework"
+optional = false
+python-versions = ">=3.9"
+files = [
+ {file = "holidays-0.59-py3-none-any.whl", hash = "sha256:4576ec7aaad7cd66463236c110bcbd533ac7e739e0e9d3cbeccf8107384a8a92"},
+ {file = "holidays-0.59.tar.gz", hash = "sha256:c5cd2e1c0c27a64217b10faf2e8fcc224f5bb64087b56b70c6aff21f6379e6e4"},
+]
+
+[package.dependencies]
+python-dateutil = "*"
+
[[package]]
name = "httpcore"
version = "1.0.5"
@@ -914,6 +974,25 @@ examples = ["keras (>=2.4.3)", "matplotlib (>=3.1.2)", "pandas (>=1.0.5)", "seab
optional = ["keras (>=2.4.3)", "pandas (>=1.0.5)", "tensorflow (>=2.4.3)"]
tests = ["black (>=23.3.0)", "flake8 (>=3.8.2)", "keras (>=2.4.3)", "mypy (>=1.3.0)", "pandas (>=1.0.5)", "pytest (>=5.0.1)", "pytest-cov (>=2.9.0)", "tensorflow (>=2.4.3)"]
+[[package]]
+name = "importlib-resources"
+version = "6.4.5"
+description = "Read resources from Python packages"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "importlib_resources-6.4.5-py3-none-any.whl", hash = "sha256:ac29d5f956f01d5e4bb63102a5a19957f1b9175e45649977264a1416783bb717"},
+ {file = "importlib_resources-6.4.5.tar.gz", hash = "sha256:980862a1d16c9e147a59603677fa2aa5fd82b87f223b6cb870695bcfce830065"},
+]
+
+[package.extras]
+check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"]
+cover = ["pytest-cov"]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
+enabler = ["pytest-enabler (>=2.2)"]
+test = ["jaraco.test (>=5.4)", "pytest (>=6,!=8.1.*)", "zipp (>=3.17)"]
+type = ["pytest-mypy"]
+
[[package]]
name = "ipykernel"
version = "6.29.5"
@@ -2708,6 +2787,11 @@ files = [
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"},
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"},
{file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"},
+ {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"},
+ {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"},
+ {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"},
+ {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"},
+ {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"},
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"},
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"},
{file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},
@@ -2939,6 +3023,26 @@ files = [
{file = "tornado-6.4.1.tar.gz", hash = "sha256:92d3ab53183d8c50f8204a51e6f91d18a15d5ef261e84d452800d4ff6fc504e9"},
]
+[[package]]
+name = "tqdm"
+version = "4.66.5"
+description = "Fast, Extensible Progress Meter"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "tqdm-4.66.5-py3-none-any.whl", hash = "sha256:90279a3770753eafc9194a0364852159802111925aa30eb3f9d85b0e805ac7cd"},
+ {file = "tqdm-4.66.5.tar.gz", hash = "sha256:e1020aef2e5096702d8a025ac7d16b1577279c9d63f8375b63083e9a5f0fcbad"},
+]
+
+[package.dependencies]
+colorama = {version = "*", markers = "platform_system == \"Windows\""}
+
+[package.extras]
+dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"]
+notebook = ["ipywidgets (>=6)"]
+slack = ["slack-sdk"]
+telegram = ["requests"]
+
[[package]]
name = "traitlets"
version = "5.14.3"
@@ -3110,7 +3214,33 @@ files = [
{file = "widgetsnbextension-4.0.13.tar.gz", hash = "sha256:ffcb67bc9febd10234a362795f643927f4e0c05d9342c727b65d2384f8feacb6"},
]
+[[package]]
+name = "woodwork"
+version = "0.31.0"
+description = "a data typing library for machine learning"
+optional = false
+python-versions = "<4,>=3.9"
+files = [
+ {file = "woodwork-0.31.0-py3-none-any.whl", hash = "sha256:5cb3370553b5f466f8c8599b1bf559584dc0b798cc1f2da26bbd7029d256c6f9"},
+ {file = "woodwork-0.31.0.tar.gz", hash = "sha256:6ef82af1d5b6525b02efe6417c574c810cfdcc606cb266bd0d7fb17a1d066b67"},
+]
+
+[package.dependencies]
+importlib-resources = ">=5.10.0"
+numpy = ">=1.25.0"
+pandas = ">=2.0.0"
+python-dateutil = ">=2.8.2"
+scikit-learn = ">=1.1.0"
+scipy = ">=1.10.0"
+
+[package.extras]
+complete = ["woodwork[updater]"]
+dev = ["click (>=8.1.7)", "pre-commit (>=2.20.0)", "ruff (>=0.1.6)", "woodwork[docs,test]"]
+docs = ["Sphinx (==5.1.1)", "ipython (==8.4.0)", "jupyter (==1.0.0)", "myst-parser (==0.18.0)", "nbconvert (==6.5.0)", "nbsphinx (==0.8.9)", "pyarrow (>=14.0.1)", "pydata-sphinx-theme (==0.9.0)", "sphinx-copybutton (==0.5.0)", "sphinx-inline-tabs (==2022.1.2b11)"]
+test = ["boto3 (>=1.34.32)", "moto[all] (>=5.0.0)", "pyarrow (>=14.0.1)", "pytest (>=7.0.1)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=2.1.0)", "smart-open (>=5.0.0)"]
+updater = ["alteryx-open-src-update-checker (>=3.1.0)"]
+
[metadata]
lock-version = "2.0"
python-versions = "^3.12"
-content-hash = "a7e3d516bde2d6e4173d8a9770fb5337a0c806dadaeda355084b262c1995f7ea"
+content-hash = "09433ce7624fd6af995c85e9e980c57cd417491975e280f0a844931df35e5085"
diff --git a/pyproject.toml b/pyproject.toml
index 0a91b71..ae221d6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,8 +17,12 @@ apiflask = "^2.2.0"
flask-cors = "^5.0.0"
scikit-learn = "^1.5.2"
imbalanced-learn = "^0.12.3"
+featuretools = "^1.31.0"
+[tool.poetry.group.dev.dependencies]
+ipykernel = "^6.29.5"
+
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"