{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Унитарное кодирование\n", "\n", "Преобразование категориального признака в несколько бинарных признаков" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Загрузка набора данных Titanic" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryPopulation2020YearlyNetChangeDensityLandAreaMigrantsFertRateMedAgeUrbanPopWorldShareNet Change
no
1China14393237760.395,540,0901539388211-348,3991.73861%18.47%5540090
2India13800043850.9913,586,6314642973190-532,6872.22835%17.70%13586631
3United States3310026510.591,937,734369147420954,8061.83883%4.25%1937734
4Indonesia2735236151.072,898,0471511811570-98,9552.33056%3.51%2898047
5Pakistan2208923402.004,327,022287770880-233,3793.62335%2.83%4327022
.......................................
231Montserrat49920.06350100NaNN.A.N.A.10%0.00%3
232Falkland Islands34803.05103012170NaNN.A.N.A.66%0.00%103
233Niue16260.68116260NaNN.A.N.A.46%0.00%11
234Tokelau13571.271713610NaNN.A.N.A.0%0.00%17
235Holy See8010.2522,0030NaNN.A.N.A.N.A.0.00%2
\n", "

235 rows × 12 columns

\n", "
" ], "text/plain": [ " Country Population2020 Yearly NetChange Density LandArea \\\n", "no \n", "1 China 1439323776 0.39 5,540,090 153 9388211 \n", "2 India 1380004385 0.99 13,586,631 464 2973190 \n", "3 United States 331002651 0.59 1,937,734 36 9147420 \n", "4 Indonesia 273523615 1.07 2,898,047 151 1811570 \n", "5 Pakistan 220892340 2.00 4,327,022 287 770880 \n", ".. ... ... ... ... ... ... \n", "231 Montserrat 4992 0.06 3 50 100 \n", "232 Falkland Islands 3480 3.05 103 0 12170 \n", "233 Niue 1626 0.68 11 6 260 \n", "234 Tokelau 1357 1.27 17 136 10 \n", "235 Holy See 801 0.25 2 2,003 0 \n", "\n", " Migrants FertRate MedAge UrbanPop WorldShare Net Change \n", "no \n", "1 -348,399 1.7 38 61% 18.47% 5540090 \n", "2 -532,687 2.2 28 35% 17.70% 13586631 \n", "3 954,806 1.8 38 83% 4.25% 1937734 \n", "4 -98,955 2.3 30 56% 3.51% 2898047 \n", "5 -233,379 3.6 23 35% 2.83% 4327022 \n", ".. ... ... ... ... ... ... \n", "231 NaN N.A. N.A. 10% 0.00% 3 \n", "232 NaN N.A. N.A. 66% 0.00% 103 \n", "233 NaN N.A. N.A. 46% 0.00% 11 \n", "234 NaN N.A. N.A. 0% 0.00% 17 \n", "235 NaN N.A. N.A. N.A. 0.00% 2 \n", "\n", "[235 rows x 12 columns]" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "countries = pd.read_csv(\n", " \"data/world-population-by-country-2020.csv\", index_col=\"no\"\n", ")\n", "\n", "countries[\"Population2020\"] = countries[\"Population2020\"].apply(\n", " lambda x: int(\"\".join(x.split(\",\")))\n", ")\n", "countries[\"Net Change\"] = countries[\"NetChange\"].apply(\n", " lambda x: int(\"\".join(x.split(\",\")))\n", ")\n", "countries[\"Yearly\"] = countries[\"Yearly\"].apply(\n", " lambda x: float(\"\".join(x.rstrip(\"%\")))\n", ")\n", "countries[\"LandArea\"] = countries[\"LandArea\"].apply(\n", " lambda x: int(\"\".join(x.split(\",\")))\n", ")\n", "countries" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Унитарное кодирование признаков Пол (Sex) и Порт посадки (Embarked)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Кодирование" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import OneHotEncoder\n", "import numpy as np\n", "\n", "# encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n", "\n", "# encoded_values = encoder.fit_transform(titanic[[\"Embarked\", \"Sex\"]])\n", "\n", "# encoded_columns = encoder.get_feature_names_out([\"Embarked\", \"Sex\"])\n", "\n", "# encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n", "\n", "# encoded_values_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Добавление признаков в исходный Dataframe" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "# titanic = pd.concat([titanic, encoded_values_df], axis=1)\n", "\n", "# titanic" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Дискретизация признаков" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Равномерное разделение данных на 3 группы" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "labels = [\"Small\", \"Middle\", \"Big\"]\n", "num_bins = 3" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([ 0. , 5458956.66666667, 10917913.33333333,\n", " 16376870. ]),\n", " array([229, 5, 1]))" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hist1, bins1 = np.histogram(\n", " countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins=num_bins\n", ")\n", "bins1, hist1" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LandAreaLandArea
no
19388211(5458956.667, 10917913.333]
22973190(0.0, 5458956.667]
39147420(5458956.667, 10917913.333]
41811570(0.0, 5458956.667]
5770880(0.0, 5458956.667]
68358140(5458956.667, 10917913.333]
7910770(0.0, 5458956.667]
8130170(0.0, 5458956.667]
916376870(10917913.333, 16376870.0]
101943950(0.0, 5458956.667]
11364555(0.0, 5458956.667]
121000000(0.0, 5458956.667]
13298170(0.0, 5458956.667]
14995450(0.0, 5458956.667]
15310070(0.0, 5458956.667]
162267050(0.0, 5458956.667]
17769630(0.0, 5458956.667]
181628550(0.0, 5458956.667]
19348560(0.0, 5458956.667]
20510890(0.0, 5458956.667]
\n", "
" ], "text/plain": [ " LandArea LandArea\n", "no \n", "1 9388211 (5458956.667, 10917913.333]\n", "2 2973190 (0.0, 5458956.667]\n", "3 9147420 (5458956.667, 10917913.333]\n", "4 1811570 (0.0, 5458956.667]\n", "5 770880 (0.0, 5458956.667]\n", "6 8358140 (5458956.667, 10917913.333]\n", "7 910770 (0.0, 5458956.667]\n", "8 130170 (0.0, 5458956.667]\n", "9 16376870 (10917913.333, 16376870.0]\n", "10 1943950 (0.0, 5458956.667]\n", "11 364555 (0.0, 5458956.667]\n", "12 1000000 (0.0, 5458956.667]\n", "13 298170 (0.0, 5458956.667]\n", "14 995450 (0.0, 5458956.667]\n", "15 310070 (0.0, 5458956.667]\n", "16 2267050 (0.0, 5458956.667]\n", "17 769630 (0.0, 5458956.667]\n", "18 1628550 (0.0, 5458956.667]\n", "19 348560 (0.0, 5458956.667]\n", "20 510890 (0.0, 5458956.667]" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat(\n", " [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins1))], axis=1\n", ").head(20)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LandAreaLandArea
no
19388211Middle
22973190Small
39147420Middle
41811570Small
5770880Small
68358140Middle
7910770Small
8130170Small
916376870Big
101943950Small
11364555Small
121000000Small
13298170Small
14995450Small
15310070Small
162267050Small
17769630Small
181628550Small
19348560Small
20510890Small
\n", "
" ], "text/plain": [ " LandArea LandArea\n", "no \n", "1 9388211 Middle\n", "2 2973190 Small\n", "3 9147420 Middle\n", "4 1811570 Small\n", "5 770880 Small\n", "6 8358140 Middle\n", "7 910770 Small\n", "8 130170 Small\n", "9 16376870 Big\n", "10 1943950 Small\n", "11 364555 Small\n", "12 1000000 Small\n", "13 298170 Small\n", "14 995450 Small\n", "15 310070 Small\n", "16 2267050 Small\n", "17 769630 Small\n", "18 1628550 Small\n", "19 348560 Small\n", "20 510890 Small" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins1), labels=labels)], axis=1).head(20)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Равномерное разделение данных на 3 группы c установкой собственной границы диапазона значений (от 0 до 100)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([ 0., 4000000., 8000000., 12000000.]),\n", " array([229, 1, 4, 1]))" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "labels = [\"Small\", \"Middle\", \"Big\"]\n", "bins2 = np.linspace(0, 12000000, 4)\n", "\n", "tmp_bins2 = np.digitize(\n", " countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins2\n", ")\n", "\n", "hist2 = np.bincount(tmp_bins2 - 1)\n", "\n", "bins2, hist2" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LandAreaLandArea
no
19388211(8000000.0, 12000000.0]
22973190(0.0, 4000000.0]
39147420(8000000.0, 12000000.0]
41811570(0.0, 4000000.0]
5770880(0.0, 4000000.0]
68358140(8000000.0, 12000000.0]
7910770(0.0, 4000000.0]
8130170(0.0, 4000000.0]
916376870NaN
101943950(0.0, 4000000.0]
11364555(0.0, 4000000.0]
121000000(0.0, 4000000.0]
13298170(0.0, 4000000.0]
14995450(0.0, 4000000.0]
15310070(0.0, 4000000.0]
162267050(0.0, 4000000.0]
17769630(0.0, 4000000.0]
181628550(0.0, 4000000.0]
19348560(0.0, 4000000.0]
20510890(0.0, 4000000.0]
\n", "
" ], "text/plain": [ " LandArea LandArea\n", "no \n", "1 9388211 (8000000.0, 12000000.0]\n", "2 2973190 (0.0, 4000000.0]\n", "3 9147420 (8000000.0, 12000000.0]\n", "4 1811570 (0.0, 4000000.0]\n", "5 770880 (0.0, 4000000.0]\n", "6 8358140 (8000000.0, 12000000.0]\n", "7 910770 (0.0, 4000000.0]\n", "8 130170 (0.0, 4000000.0]\n", "9 16376870 NaN\n", "10 1943950 (0.0, 4000000.0]\n", "11 364555 (0.0, 4000000.0]\n", "12 1000000 (0.0, 4000000.0]\n", "13 298170 (0.0, 4000000.0]\n", "14 995450 (0.0, 4000000.0]\n", "15 310070 (0.0, 4000000.0]\n", "16 2267050 (0.0, 4000000.0]\n", "17 769630 (0.0, 4000000.0]\n", "18 1628550 (0.0, 4000000.0]\n", "19 348560 (0.0, 4000000.0]\n", "20 510890 (0.0, 4000000.0]" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins2))], axis=1).head(20)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LandAreaLandArea
no
19388211Big
22973190Small
39147420Big
41811570Small
5770880Small
68358140Big
7910770Small
8130170Small
916376870NaN
101943950Small
11364555Small
121000000Small
13298170Small
14995450Small
15310070Small
162267050Small
17769630Small
181628550Small
19348560Small
20510890Small
\n", "
" ], "text/plain": [ " LandArea LandArea\n", "no \n", "1 9388211 Big\n", "2 2973190 Small\n", "3 9147420 Big\n", "4 1811570 Small\n", "5 770880 Small\n", "6 8358140 Big\n", "7 910770 Small\n", "8 130170 Small\n", "9 16376870 NaN\n", "10 1943950 Small\n", "11 364555 Small\n", "12 1000000 Small\n", "13 298170 Small\n", "14 995450 Small\n", "15 310070 Small\n", "16 2267050 Small\n", "17 769630 Small\n", "18 1628550 Small\n", "19 348560 Small\n", "20 510890 Small" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat(\n", " [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins2), labels=labels)],\n", " axis=1,\n", ").head(20)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Равномерное разделение данных на 3 группы c установкой собственных интервалов (0 - 39, 40 - 60, 61 - 100)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([0.e+00, 1.e+03, 1.e+05, 5.e+05, 3.e+06, inf]),\n", " array([52, 77, 56, 44, 6]))" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "labels2 = [\"Dwarf\", \"Small\", \"Middle\", \"Big\", \"Giant\"]\n", "hist3, bins3 = np.histogram(\n", "\n", " countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins=[0, 1000, 100000, 500000, 3000000, np.inf]\n", ")\n", "\n", "\n", "bins3, hist3" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LandAreaLandArea
no
19388211(3000000.0, inf]
22973190(500000.0, 3000000.0]
39147420(3000000.0, inf]
41811570(500000.0, 3000000.0]
5770880(500000.0, 3000000.0]
68358140(3000000.0, inf]
7910770(500000.0, 3000000.0]
8130170(100000.0, 500000.0]
916376870(3000000.0, inf]
101943950(500000.0, 3000000.0]
11364555(100000.0, 500000.0]
121000000(500000.0, 3000000.0]
13298170(100000.0, 500000.0]
14995450(500000.0, 3000000.0]
15310070(100000.0, 500000.0]
162267050(500000.0, 3000000.0]
17769630(500000.0, 3000000.0]
181628550(500000.0, 3000000.0]
19348560(100000.0, 500000.0]
20510890(500000.0, 3000000.0]
\n", "
" ], "text/plain": [ " LandArea LandArea\n", "no \n", "1 9388211 (3000000.0, inf]\n", "2 2973190 (500000.0, 3000000.0]\n", "3 9147420 (3000000.0, inf]\n", "4 1811570 (500000.0, 3000000.0]\n", "5 770880 (500000.0, 3000000.0]\n", "6 8358140 (3000000.0, inf]\n", "7 910770 (500000.0, 3000000.0]\n", "8 130170 (100000.0, 500000.0]\n", "9 16376870 (3000000.0, inf]\n", "10 1943950 (500000.0, 3000000.0]\n", "11 364555 (100000.0, 500000.0]\n", "12 1000000 (500000.0, 3000000.0]\n", "13 298170 (100000.0, 500000.0]\n", "14 995450 (500000.0, 3000000.0]\n", "15 310070 (100000.0, 500000.0]\n", "16 2267050 (500000.0, 3000000.0]\n", "17 769630 (500000.0, 3000000.0]\n", "18 1628550 (500000.0, 3000000.0]\n", "19 348560 (100000.0, 500000.0]\n", "20 510890 (500000.0, 3000000.0]" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins3))], axis=1).head(20)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LandAreaLandArea
no
19388211Giant
22973190Big
39147420Giant
41811570Big
5770880Big
68358140Giant
7910770Big
8130170Middle
916376870Giant
101943950Big
11364555Middle
121000000Big
13298170Middle
14995450Big
15310070Middle
162267050Big
17769630Big
181628550Big
19348560Middle
20510890Big
\n", "
" ], "text/plain": [ " LandArea LandArea\n", "no \n", "1 9388211 Giant\n", "2 2973190 Big\n", "3 9147420 Giant\n", "4 1811570 Big\n", "5 770880 Big\n", "6 8358140 Giant\n", "7 910770 Big\n", "8 130170 Middle\n", "9 16376870 Giant\n", "10 1943950 Big\n", "11 364555 Middle\n", "12 1000000 Big\n", "13 298170 Middle\n", "14 995450 Big\n", "15 310070 Middle\n", "16 2267050 Big\n", "17 769630 Big\n", "18 1628550 Big\n", "19 348560 Middle\n", "20 510890 Big" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat(\n", " [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins3), labels=labels2)],\n", " axis=1,\n", ").head(20)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Квантильное разделение данных на 5 групп\n" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LandAreaLandArea
no
193882114
229731904
391474204
418115704
57708804
683581404
79107704
81301702
9163768704
1019439504
113645553
1210000004
132981703
149954504
153100703
1622670504
177696304
1816285504
193485603
205108903
\n", "
" ], "text/plain": [ " LandArea LandArea\n", "no \n", "1 9388211 4\n", "2 2973190 4\n", "3 9147420 4\n", "4 1811570 4\n", "5 770880 4\n", "6 8358140 4\n", "7 910770 4\n", "8 130170 2\n", "9 16376870 4\n", "10 1943950 4\n", "11 364555 3\n", "12 1000000 4\n", "13 298170 3\n", "14 995450 4\n", "15 310070 3\n", "16 2267050 4\n", "17 769630 4\n", "18 1628550 4\n", "19 348560 3\n", "20 510890 3" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([countries[\"LandArea\"], pd.qcut(countries[\"LandArea\"], q=5, labels=False)], axis=1).head(20)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LandAreaLandArea
no
19388211Giant
22973190Giant
39147420Giant
41811570Giant
5770880Giant
68358140Giant
7910770Giant
8130170Middle
916376870Giant
101943950Giant
11364555Big
121000000Giant
13298170Big
14995450Giant
15310070Big
162267050Giant
17769630Giant
181628550Giant
19348560Big
20510890Big
\n", "
" ], "text/plain": [ " LandArea LandArea\n", "no \n", "1 9388211 Giant\n", "2 2973190 Giant\n", "3 9147420 Giant\n", "4 1811570 Giant\n", "5 770880 Giant\n", "6 8358140 Giant\n", "7 910770 Giant\n", "8 130170 Middle\n", "9 16376870 Giant\n", "10 1943950 Giant\n", "11 364555 Big\n", "12 1000000 Giant\n", "13 298170 Big\n", "14 995450 Giant\n", "15 310070 Big\n", "16 2267050 Giant\n", "17 769630 Giant\n", "18 1628550 Giant\n", "19 348560 Big\n", "20 510890 Big" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([countries[\"LandArea\"], pd.qcut(countries[\"LandArea\"], q=5, labels=labels2)], axis=1).head(20)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Пример конструирования признаков на основе существующих\n", "\n", "Title - обращение к пассажиру (Mr, Mrs, Miss)\n", "\n", "Is_married - замужняя ли женщина\n", "\n", "Cabin_type - палуба (тип каюты)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "# titanic_cl = titanic.drop(\n", "# [\"Embarked_Q\", \"Embarked_S\", \"Embarked_nan\", \"Sex_male\"], axis=1, errors=\"ignore\"\n", "# )\n", "# titanic_cl = titanic_cl.dropna()\n", "\n", "# titanic_cl[\"Title\"] = [\n", "# i.split(\",\")[1].split(\".\")[0].strip() for i in titanic_cl[\"Name\"]\n", "# ]\n", "\n", "# titanic_cl[\"Is_married\"] = [1 if i == \"Mrs\" else 0 for i in titanic_cl[\"Title\"]]\n", "\n", "# titanic_cl[\"Cabin_type\"] = [i[0] for i in titanic_cl[\"Cabin\"]]\n", "\n", "# titanic_cl" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Пример использования библиотеки Featuretools для автоматического конструирования (синтеза) признаков\n", "\n", "https://featuretools.alteryx.com/en/stable/getting_started/using_entitysets.html" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Загрузка данных\n", "\n", "За основу был взят набор данных \"Ecommerce Orders Data Set\" из Kaggle\n", "\n", "Используется только 100 первых заказов и связанные с ними объекты\n", "\n", "https://www.kaggle.com/datasets/sangamsharmait/ecommerce-orders-data-analysis" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "( no Country Population2020 Yearly NetChange Density \\\n", " 0 1 China 1439323776 0.39 5540090 153 \n", " 1 2 India 1380004385 0.99 13586631 464 \n", " 2 3 United States 331002651 0.59 1937734 36 \n", " 3 4 Indonesia 273523615 1.07 2898047 151 \n", " 4 5 Pakistan 220892340 2.00 4327022 287 \n", " .. ... ... ... ... ... ... \n", " 230 231 Montserrat 4992 0.06 3 50 \n", " 231 232 Falkland Islands 3480 3.05 103 0 \n", " 232 233 Niue 1626 0.68 11 6 \n", " 233 234 Tokelau 1357 1.27 17 136 \n", " 234 235 Holy See 801 0.25 2 2,003 \n", " \n", " LandArea \n", " 0 9388211 \n", " 1 2973190 \n", " 2 9147420 \n", " 3 1811570 \n", " 4 770880 \n", " .. ... \n", " 230 100 \n", " 231 12170 \n", " 232 260 \n", " 233 10 \n", " 234 0 \n", " \n", " [235 rows x 7 columns],\n", " Year Population YearlyPer Yearly Median Fertility Density\n", " 0 2020 7794798739 1.10 83000320 31 2.47 52\n", " 1 2025 8184437460 0.98 77927744 32 2.54 55\n", " 2 2030 8548487400 0.87 72809988 33 2.62 57\n", " 3 2035 8887524213 0.78 67807363 34 2.70 60\n", " 4 2040 9198847240 0.69 62264605 35 2.77 62\n", " 5 2045 9481803274 0.61 56591207 35 2.85 64\n", " 6 2050 9735033990 0.53 50646143 36 2.95 65,\n", " Country Capital Continent\n", " 0 Afghanistan Kabul Asia\n", " 1 Albania Tirana Europe\n", " 2 Algeria Algiers Africa\n", " 3 American Samoa Pago Pago Oceania\n", " 4 Andorra Andorra la Vella Europe\n", " .. ... ... ...\n", " 229 Wallis and Futuna Mata-Utu Oceania\n", " 230 Western Sahara El Aai?�n Africa\n", " 231 Yemen Sanaa Asia\n", " 232 Zambia Lusaka Africa\n", " 233 Zimbabwe Harare Africa\n", " \n", " [234 rows x 3 columns])" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import featuretools as ft\n", "from woodwork.logical_types import Categorical, Datetime\n", "\n", "info = pd.read_csv(\"data/world-population-by-country-2020.csv\")\n", "forcast = pd.read_csv(\"data/world-population-forcast-2020-2050.csv\")\n", "capitals = pd.read_csv(\"data/countries-continents-capitals.csv\", encoding=\"ISO-8859-1\")\n", "forcast[\"Population\"] = forcast[\"Population\"].apply(\n", " lambda x: int(\"\".join(x.split(\",\")))\n", ")\n", "forcast[\"YearlyPer\"] = forcast[\"YearlyPer\"].apply(\n", " lambda x: float(\"\".join(x.rstrip(\"%\")))\n", ")\n", "forcast[\"Yearly\"] = forcast[\"Yearly\"].apply(\n", " lambda x: int(\"\".join(x.split(\",\")))\n", ")\n", "info = info.drop([\"Migrants\", \"FertRate\", \"MedAge\", \"UrbanPop\", \"WorldShare\"], axis=1)\n", "info[\"Population2020\"] = info[\"Population2020\"].apply(\n", " lambda x: int(\"\".join(x.split(\",\")))\n", ")\n", "info[\"Yearly\"] = info[\"Yearly\"].apply(\n", " lambda x: float(\"\".join(x.rstrip(\"%\")))\n", ")\n", "info[\"NetChange\"] = info[\"NetChange\"].apply(\n", " lambda x: int(\"\".join(x.split(\",\")))\n", ")\n", "info[\"LandArea\"] = info[\"LandArea\"].apply(\n", " lambda x: int(\"\".join(x.split(\",\")))\n", ")\n", "\n", "info, forcast, capitals" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Создание сущностей в featuretools\n", "\n", "Добавление dataframe'ов с данными в EntitySet с указанием параметров: название сущности (таблицы), первичный ключ, категориальные атрибуты (в том числе даты)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n" ] }, { "data": { "text/plain": [ "Entityset: countries\n", " DataFrames:\n", " countries [Rows: 235, Columns: 7]\n", " capitals [Rows: 234, Columns: 3]\n", " forcast [Rows: 7, Columns: 8]\n", " Relationships:\n", " No relationships" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "es = ft.EntitySet(id=\"countries\")\n", "\n", "es = es.add_dataframe(\n", " dataframe_name=\"countries\",\n", " dataframe=info,\n", " index=\"no\",\n", " logical_types={\n", " \"Country\": Categorical,\n", " },\n", ")\n", "es = es.add_dataframe(\n", " dataframe_name=\"capitals\",\n", " dataframe=capitals,\n", " index=\"Country\",\n", " logical_types={\n", " \"Country\": Categorical,\n", " \"Capital\": Categorical,\n", " \"Continent\": Categorical,\n", " },\n", ")\n", "es = es.add_dataframe(\n", " dataframe_name=\"forcast\",\n", " dataframe=forcast,\n", " index=\"forcast_id\",\n", " make_index=True,\n", " logical_types={\n", " \"Year\": Datetime,\n", " },\n", ")\n", "\n", "es" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Настройка связей между сущностями featuretools\n", "\n", "Настройка связей между таблицами на уровне ключей\n", "\n", "Связь указывается от родителя к потомкам (таблица-родитель, первичный ключ, таблица-потомок, внешний ключ)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Entityset: countries\n", " DataFrames:\n", " countries [Rows: 235, Columns: 7]\n", " capitals [Rows: 234, Columns: 3]\n", " forcast [Rows: 7, Columns: 8]\n", " Relationships:\n", " countries.Country -> capitals.Country" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "es = es.add_relationship(\"capitals\", \"Country\", \"countries\", \"Country\")\n", "\n", "es" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Автоматическое конструирование признаков с помощью featuretools\n", "\n", "Библиотека применят различные функции агрегации и трансформации к атрибутам таблицы order_items с учетом отношений\n", "\n", "Результат помещается в Dataframe feature_matrix" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryPopulation2020YearlyNetChangeLandAreacapitals.Capitalcapitals.Continent
no
1China14393237760.3955400909388211BeijingAsia
2India13800043850.99135866312973190New DelhiAsia
3United States3310026510.5919377349147420Washington, D.C.North America
4Indonesia2735236151.0728980471811570JakartaAsia
5Pakistan2208923402.004327022770880IslamabadAsia
........................
231Montserrat49920.063100BradesNorth America
232Falkland Islands34803.0510312170StanleySouth America
233Niue16260.6811260AlofiOceania
234Tokelau13571.271710NukunonuOceania
235Holy See8010.2520NaNNaN
\n", "

235 rows × 7 columns

\n", "
" ], "text/plain": [ " Country Population2020 Yearly NetChange LandArea \\\n", "no \n", "1 China 1439323776 0.39 5540090 9388211 \n", "2 India 1380004385 0.99 13586631 2973190 \n", "3 United States 331002651 0.59 1937734 9147420 \n", "4 Indonesia 273523615 1.07 2898047 1811570 \n", "5 Pakistan 220892340 2.00 4327022 770880 \n", ".. ... ... ... ... ... \n", "231 Montserrat 4992 0.06 3 100 \n", "232 Falkland Islands 3480 3.05 103 12170 \n", "233 Niue 1626 0.68 11 260 \n", "234 Tokelau 1357 1.27 17 10 \n", "235 Holy See 801 0.25 2 0 \n", "\n", " capitals.Capital capitals.Continent \n", "no \n", "1 Beijing Asia \n", "2 New Delhi Asia \n", "3 Washington, D.C. North America \n", "4 Jakarta Asia \n", "5 Islamabad Asia \n", ".. ... ... \n", "231 Brades North America \n", "232 Stanley South America \n", "233 Alofi Oceania \n", "234 Nukunonu Oceania \n", "235 NaN NaN \n", "\n", "[235 rows x 7 columns]" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_matrix, feature_defs = ft.dfs(\n", " entityset=es,\n", " target_dataframe_name=\"countries\",\n", " max_depth=1,\n", ")\n", "\n", "feature_matrix" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Полученные признаки\n", "\n", "Список колонок полученного dataframe'а" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ]" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_defs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Отсечение значений признаков" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Определение выбросов с помощью boxplot" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries.boxplot(column=\"Population2020\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Отсечение данных для признака Возраст, значение которых больше 65 лет" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryPopulation2020PopulationClip
no
1China143932377650000000
2India138000438550000000
3United States33100265150000000
4Indonesia27352361550000000
5Pakistan22089234050000000
6Brazil21255941750000000
7Nigeria20613958950000000
8Bangladesh16468938350000000
9Russia14593446250000000
10Mexico12893275350000000
11Japan12647646150000000
12Ethiopia11496358850000000
13Philippines10958107850000000
14Egypt10233440450000000
15Vietnam9733857950000000
16DR Congo8956140350000000
17Turkey8433906750000000
18Iran8399294950000000
19Germany8378394250000000
20Thailand6979997850000000
21United Kingdom6788601150000000
22France6527351150000000
23Italy6046182650000000
24Tanzania5973421850000000
25South Africa5930869050000000
26Myanmar5440980050000000
27Kenya5377129650000000
28South Korea5126918550000000
29Colombia5088289150000000
\n", "
" ], "text/plain": [ " Country Population2020 PopulationClip\n", "no \n", "1 China 1439323776 50000000\n", "2 India 1380004385 50000000\n", "3 United States 331002651 50000000\n", "4 Indonesia 273523615 50000000\n", "5 Pakistan 220892340 50000000\n", "6 Brazil 212559417 50000000\n", "7 Nigeria 206139589 50000000\n", "8 Bangladesh 164689383 50000000\n", "9 Russia 145934462 50000000\n", "10 Mexico 128932753 50000000\n", "11 Japan 126476461 50000000\n", "12 Ethiopia 114963588 50000000\n", "13 Philippines 109581078 50000000\n", "14 Egypt 102334404 50000000\n", "15 Vietnam 97338579 50000000\n", "16 DR Congo 89561403 50000000\n", "17 Turkey 84339067 50000000\n", "18 Iran 83992949 50000000\n", "19 Germany 83783942 50000000\n", "20 Thailand 69799978 50000000\n", "21 United Kingdom 67886011 50000000\n", "22 France 65273511 50000000\n", "23 Italy 60461826 50000000\n", "24 Tanzania 59734218 50000000\n", "25 South Africa 59308690 50000000\n", "26 Myanmar 54409800 50000000\n", "27 Kenya 53771296 50000000\n", "28 South Korea 51269185 50000000\n", "29 Colombia 50882891 50000000" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries_norm = countries.copy()\n", "\n", "countries_norm[\"PopulationClip\"] = countries_norm[\"Population2020\"].clip(0, 50000000);\n", "\n", "countries_norm[countries_norm[\"Population2020\"] > 50000000][\n", " [\"Country\", \"Population2020\", \"PopulationClip\"]\n", "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Винсоризация признака Возраст" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "111195830.99999991\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryPopulation2020PopulationWinsorized
no
1China1439323776114963588
2India1380004385114963588
3United States331002651114963588
4Indonesia273523615114963588
5Pakistan220892340114963588
6Brazil212559417114963588
7Nigeria206139589114963588
8Bangladesh164689383114963588
9Russia145934462114963588
10Mexico128932753114963588
11Japan126476461114963588
12Ethiopia114963588114963588
13Philippines109581078109581078
14Egypt102334404102334404
15Vietnam9733857997338579
16DR Congo8956140389561403
17Turkey8433906784339067
18Iran8399294983992949
19Germany8378394283783942
20Thailand6979997869799978
21United Kingdom6788601167886011
22France6527351165273511
23Italy6046182660461826
24Tanzania5973421859734218
25South Africa5930869059308690
26Myanmar5440980054409800
27Kenya5377129653771296
28South Korea5126918551269185
29Colombia5088289150882891
\n", "
" ], "text/plain": [ " Country Population2020 PopulationWinsorized\n", "no \n", "1 China 1439323776 114963588\n", "2 India 1380004385 114963588\n", "3 United States 331002651 114963588\n", "4 Indonesia 273523615 114963588\n", "5 Pakistan 220892340 114963588\n", "6 Brazil 212559417 114963588\n", "7 Nigeria 206139589 114963588\n", "8 Bangladesh 164689383 114963588\n", "9 Russia 145934462 114963588\n", "10 Mexico 128932753 114963588\n", "11 Japan 126476461 114963588\n", "12 Ethiopia 114963588 114963588\n", "13 Philippines 109581078 109581078\n", "14 Egypt 102334404 102334404\n", "15 Vietnam 97338579 97338579\n", "16 DR Congo 89561403 89561403\n", "17 Turkey 84339067 84339067\n", "18 Iran 83992949 83992949\n", "19 Germany 83783942 83783942\n", "20 Thailand 69799978 69799978\n", "21 United Kingdom 67886011 67886011\n", "22 France 65273511 65273511\n", "23 Italy 60461826 60461826\n", "24 Tanzania 59734218 59734218\n", "25 South Africa 59308690 59308690\n", "26 Myanmar 54409800 54409800\n", "27 Kenya 53771296 53771296\n", "28 South Korea 51269185 51269185\n", "29 Colombia 50882891 50882891" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from scipy.stats.mstats import winsorize\n", "\n", "print(countries_norm[\"Population2020\"].quantile(q=0.95))\n", "\n", "countries_norm[\"PopulationWinsorized\"] = winsorize(\n", " countries_norm[\"Population2020\"].fillna(countries_norm[\"Population2020\"].mean()),\n", " (0, 0.05),\n", " inplace=False,\n", ")\n", "\n", "countries_norm[countries_norm[\"Population2020\"] > 50000000][\n", " [\"Country\", \"Population2020\", \"PopulationWinsorized\"]\n", "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Нормализация значений" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryPopulation2020PopulationNormPopulationClipNormPopulationWinsorizedNormPopulationWinsorizedNorm2
no
1China14393237761.000000e+001.0000001.0000001.000000
2India13800043859.587866e-011.0000001.0000001.000000
3United States3310026512.299705e-011.0000001.0000001.000000
4Indonesia2735236151.900357e-011.0000001.0000001.000000
5Pakistan2208923401.534691e-011.0000001.0000001.000000
.....................
231Montserrat49922.911786e-060.0000840.000036-0.999927
232Falkland Islands34801.861292e-060.0000540.000023-0.999953
233Niue16265.731862e-070.0000170.000007-0.999986
234Tokelau13573.862927e-070.0000110.000005-0.999990
235Holy See8010.000000e+000.0000000.000000-1.000000
\n", "

235 rows × 6 columns

\n", "
" ], "text/plain": [ " Country Population2020 PopulationNorm PopulationClipNorm \\\n", "no \n", "1 China 1439323776 1.000000e+00 1.000000 \n", "2 India 1380004385 9.587866e-01 1.000000 \n", "3 United States 331002651 2.299705e-01 1.000000 \n", "4 Indonesia 273523615 1.900357e-01 1.000000 \n", "5 Pakistan 220892340 1.534691e-01 1.000000 \n", ".. ... ... ... ... \n", "231 Montserrat 4992 2.911786e-06 0.000084 \n", "232 Falkland Islands 3480 1.861292e-06 0.000054 \n", "233 Niue 1626 5.731862e-07 0.000017 \n", "234 Tokelau 1357 3.862927e-07 0.000011 \n", "235 Holy See 801 0.000000e+00 0.000000 \n", "\n", " PopulationWinsorizedNorm PopulationWinsorizedNorm2 \n", "no \n", "1 1.000000 1.000000 \n", "2 1.000000 1.000000 \n", "3 1.000000 1.000000 \n", "4 1.000000 1.000000 \n", "5 1.000000 1.000000 \n", ".. ... ... \n", "231 0.000036 -0.999927 \n", "232 0.000023 -0.999953 \n", "233 0.000007 -0.999986 \n", "234 0.000005 -0.999990 \n", "235 0.000000 -1.000000 \n", "\n", "[235 rows x 6 columns]" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn import preprocessing\n", "\n", "min_max_scaler = preprocessing.MinMaxScaler()\n", "\n", "min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n", "\n", "countries_norm[\"PopulationNorm\"] = min_max_scaler.fit_transform(\n", " countries_norm[\"Population2020\"].to_numpy().reshape(-1, 1)\n", ").reshape(countries_norm[\"Population2020\"].shape)\n", "\n", "countries_norm[\"PopulationClipNorm\"] = min_max_scaler.fit_transform(\n", " countries_norm[\"PopulationClip\"].to_numpy().reshape(-1, 1)\n", ").reshape(countries_norm[\"Population2020\"].shape)\n", "\n", "countries_norm[\"PopulationWinsorizedNorm\"] = min_max_scaler.fit_transform(\n", " countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n", ").reshape(countries_norm[\"Population2020\"].shape)\n", "\n", "countries_norm[\"PopulationWinsorizedNorm2\"] = min_max_scaler_2.fit_transform(\n", " countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n", ").reshape(countries_norm[\"Population2020\"].shape)\n", "\n", "countries_norm[\n", " [\n", " \"Country\",\n", " \"Population2020\",\n", " \"PopulationNorm\",\n", " \"PopulationClipNorm\",\n", " \"PopulationWinsorizedNorm\",\n", " \"PopulationWinsorizedNorm2\",\n", " ]\n", "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Стандартизация значений" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryPopulation2020PopulationStandPopulationClipStandPopulationWinsorizedStand
no
1China143932377610.4275972.0739333.171659
2India13800043859.9877022.0739333.171659
3United States3310026512.2086272.0739333.171659
4Indonesia2735236151.7823802.0739333.171659
5Pakistan2208923401.3920822.0739333.171659
..................
231Montserrat4992-0.245950-0.795071-0.621969
232Falkland Islands3480-0.245962-0.795158-0.622019
233Niue1626-0.245975-0.795265-0.622080
234Tokelau1357-0.245977-0.795280-0.622089
235Holy See801-0.245982-0.795312-0.622107
\n", "

235 rows × 5 columns

\n", "
" ], "text/plain": [ " Country Population2020 PopulationStand PopulationClipStand \\\n", "no \n", "1 China 1439323776 10.427597 2.073933 \n", "2 India 1380004385 9.987702 2.073933 \n", "3 United States 331002651 2.208627 2.073933 \n", "4 Indonesia 273523615 1.782380 2.073933 \n", "5 Pakistan 220892340 1.392082 2.073933 \n", ".. ... ... ... ... \n", "231 Montserrat 4992 -0.245950 -0.795071 \n", "232 Falkland Islands 3480 -0.245962 -0.795158 \n", "233 Niue 1626 -0.245975 -0.795265 \n", "234 Tokelau 1357 -0.245977 -0.795280 \n", "235 Holy See 801 -0.245982 -0.795312 \n", "\n", " PopulationWinsorizedStand \n", "no \n", "1 3.171659 \n", "2 3.171659 \n", "3 3.171659 \n", "4 3.171659 \n", "5 3.171659 \n", ".. ... \n", "231 -0.621969 \n", "232 -0.622019 \n", "233 -0.622080 \n", "234 -0.622089 \n", "235 -0.622107 \n", "\n", "[235 rows x 5 columns]" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn import preprocessing\n", "\n", "stndart_scaler = preprocessing.StandardScaler()\n", "\n", "countries_norm[\"PopulationStand\"] = stndart_scaler.fit_transform(\n", " countries_norm[\"Population2020\"].to_numpy().reshape(-1, 1)\n", ").reshape(countries_norm[\"Population2020\"].shape)\n", "\n", "countries_norm[\"PopulationClipStand\"] = stndart_scaler.fit_transform(\n", " countries_norm[\"PopulationClip\"].to_numpy().reshape(-1, 1)\n", ").reshape(countries_norm[\"Population2020\"].shape)\n", "\n", "countries_norm[\"PopulationWinsorizedStand\"] = stndart_scaler.fit_transform(\n", " countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n", ").reshape(countries_norm[\"Population2020\"].shape)\n", "\n", "countries_norm[\n", " [\n", " \"Country\",\n", " \"Population2020\",\n", " \"PopulationStand\",\n", " \"PopulationClipStand\",\n", " \"PopulationWinsorizedStand\",\n", " ]\n", "]" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 2 }