From 01c27ac023bd6339c907ccf7e0dab260d03ee3c3 Mon Sep 17 00:00:00 2001 From: gg12 darfren <frenkiarts@yandex.ru> Date: Wed, 23 Oct 2024 13:43:55 +0400 Subject: [PATCH] Lab3 done --- data/countries-continents-capitals.csv | 12 +- data/world-population-by-country-2020.csv | 2 +- data/world-population-forcast-2020-2050.csv | 2 +- lec3.ipynb | 3453 +++++++++++++++++++ poetry.lock | 132 +- pyproject.toml | 4 + 6 files changed, 3596 insertions(+), 9 deletions(-) create mode 100644 lec3.ipynb diff --git a/data/countries-continents-capitals.csv b/data/countries-continents-capitals.csv index b90f340..61258ec 100644 --- a/data/countries-continents-capitals.csv +++ b/data/countries-continents-capitals.csv @@ -1,4 +1,4 @@ -Country/Territory,Capital,Continent +Country,Capital,Continent Afghanistan,Kabul,Asia Albania,Tirana,Europe Algeria,Algiers,Africa @@ -6,7 +6,7 @@ American Samoa,Pago Pago,Oceania Andorra,Andorra la Vella,Europe Angola,Luanda,Africa Anguilla,The Valley,North America -Antigua and Barbuda,Saint John�??s,North America +Antigua and Barbuda,Saint John�??s,North America Argentina,Buenos Aires,South America Armenia,Yerevan,Asia Aruba,Oranjestad,North America @@ -80,7 +80,7 @@ Greece,Athens,Europe Greenland,Nuuk,North America Grenada,Saint George's,North America Guadeloupe,Basse-Terre,North America -Guam,Hag?�t?�a,Oceania +Guam,Hag?�t?�a,Oceania Guatemala,Guatemala City,North America Guernsey,Saint Peter Port,Europe Guinea,Conakry,Africa @@ -180,7 +180,7 @@ Saint Pierre and Miquelon,Saint-Pierre,North America Saint Vincent and the Grenadines,Kingstown,North America Samoa,Apia,Oceania San Marino,San Marino,Europe -Sao Tome and Principe,S?�o Tom??,Africa +Sao Tome and Principe,S?�o Tom??,Africa Saudi Arabia,Riyadh,Asia Senegal,Dakar,Africa Serbia,Belgrade,Europe @@ -209,7 +209,7 @@ Thailand,Bangkok,Asia Timor-Leste,Dili,Asia Togo,Lom??,Africa Tokelau,Nukunonu,Oceania -Tonga,Nuku�??alofa,Oceania +Tonga,Nuku�??alofa,Oceania Trinidad and Tobago,Port-of-Spain,North 
America Tunisia,Tunis,Africa Turkey,Ankara,Asia @@ -229,7 +229,7 @@ Vatican City,Vatican City,Europe Venezuela,Caracas,South America Vietnam,Hanoi,Asia Wallis and Futuna,Mata-Utu,Oceania -Western Sahara,El Aai?�n,Africa +Western Sahara,El Aai?�n,Africa Yemen,Sanaa,Asia Zambia,Lusaka,Africa Zimbabwe,Harare,Africa diff --git a/data/world-population-by-country-2020.csv b/data/world-population-by-country-2020.csv index 78c1f46..d923824 100644 --- a/data/world-population-by-country-2020.csv +++ b/data/world-population-by-country-2020.csv @@ -1,4 +1,4 @@ -no,Country (or dependency),Population 2020,Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share +no,Country,Population2020,Yearly,NetChange,Density,LandArea,Migrants,FertRate,MedAge,UrbanPop,WorldShare 1,China,"1,439,323,776",0.39%,"5,540,090",153,"9,388,211","-348,399",1.7,38,61%,18.47% 2,India,"1,380,004,385",0.99%,"13,586,631",464,"2,973,190","-532,687",2.2,28,35%,17.70% 3,United States,"331,002,651",0.59%,"1,937,734",36,"9,147,420","954,806",1.8,38,83%,4.25% diff --git a/data/world-population-forcast-2020-2050.csv b/data/world-population-forcast-2020-2050.csv index 621e50e..a27901d 100644 --- a/data/world-population-forcast-2020-2050.csv +++ b/data/world-population-forcast-2020-2050.csv @@ -1,4 +1,4 @@ -Year,Population,Yearly %,Yearly,Median,Fertility,Density +Year,Population,YearlyPer,Yearly,Median,Fertility,Density 2020,"7,794,798,739",1.10%,"83,000,320",31,2.47,52 2025,"8,184,437,460",0.98%,"77,927,744",32,2.54,55 2030,"8,548,487,400",0.87%,"72,809,988",33,2.62,57 diff --git a/lec3.ipynb b/lec3.ipynb new file mode 100644 index 0000000..e1fc869 --- /dev/null +++ b/lec3.ipynb @@ -0,0 +1,3453 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Унитарное кодирование\n", + "\n", + "Преобразование категориального признака в несколько бинарных признаков" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "#### Загрузка набора данных Titanic" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Country</th>\n", + " <th>Population2020</th>\n", + " <th>Yearly</th>\n", + " <th>NetChange</th>\n", + " <th>Density</th>\n", + " <th>LandArea</th>\n", + " <th>Migrants</th>\n", + " <th>FertRate</th>\n", + " <th>MedAge</th>\n", + " <th>UrbanPop</th>\n", + " <th>WorldShare</th>\n", + " <th>Net Change</th>\n", + " </tr>\n", + " <tr>\n", + " <th>no</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>China</td>\n", + " <td>1439323776</td>\n", + " <td>0.39</td>\n", + " <td>5,540,090</td>\n", + " <td>153</td>\n", + " <td>9388211</td>\n", + " <td>-348,399</td>\n", + " <td>1.7</td>\n", + " <td>38</td>\n", + " <td>61%</td>\n", + " <td>18.47%</td>\n", + " <td>5540090</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>India</td>\n", + " <td>1380004385</td>\n", + " <td>0.99</td>\n", + " <td>13,586,631</td>\n", + " <td>464</td>\n", + " <td>2973190</td>\n", + " <td>-532,687</td>\n", + " <td>2.2</td>\n", + " <td>28</td>\n", + " <td>35%</td>\n", + " <td>17.70%</td>\n", + " <td>13586631</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>United States</td>\n", + " <td>331002651</td>\n", + " <td>0.59</td>\n", + " 
<td>1,937,734</td>\n", + " <td>36</td>\n", + " <td>9147420</td>\n", + " <td>954,806</td>\n", + " <td>1.8</td>\n", + " <td>38</td>\n", + " <td>83%</td>\n", + " <td>4.25%</td>\n", + " <td>1937734</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Indonesia</td>\n", + " <td>273523615</td>\n", + " <td>1.07</td>\n", + " <td>2,898,047</td>\n", + " <td>151</td>\n", + " <td>1811570</td>\n", + " <td>-98,955</td>\n", + " <td>2.3</td>\n", + " <td>30</td>\n", + " <td>56%</td>\n", + " <td>3.51%</td>\n", + " <td>2898047</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>Pakistan</td>\n", + " <td>220892340</td>\n", + " <td>2.00</td>\n", + " <td>4,327,022</td>\n", + " <td>287</td>\n", + " <td>770880</td>\n", + " <td>-233,379</td>\n", + " <td>3.6</td>\n", + " <td>23</td>\n", + " <td>35%</td>\n", + " <td>2.83%</td>\n", + " <td>4327022</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>231</th>\n", + " <td>Montserrat</td>\n", + " <td>4992</td>\n", + " <td>0.06</td>\n", + " <td>3</td>\n", + " <td>50</td>\n", + " <td>100</td>\n", + " <td>NaN</td>\n", + " <td>N.A.</td>\n", + " <td>N.A.</td>\n", + " <td>10%</td>\n", + " <td>0.00%</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>232</th>\n", + " <td>Falkland Islands</td>\n", + " <td>3480</td>\n", + " <td>3.05</td>\n", + " <td>103</td>\n", + " <td>0</td>\n", + " <td>12170</td>\n", + " <td>NaN</td>\n", + " <td>N.A.</td>\n", + " <td>N.A.</td>\n", + " <td>66%</td>\n", + " <td>0.00%</td>\n", + " <td>103</td>\n", + " </tr>\n", + " <tr>\n", + " <th>233</th>\n", + " <td>Niue</td>\n", + " <td>1626</td>\n", + " <td>0.68</td>\n", + " <td>11</td>\n", + " <td>6</td>\n", + " <td>260</td>\n", + " <td>NaN</td>\n", + " 
<td>N.A.</td>\n", + " <td>N.A.</td>\n", + " <td>46%</td>\n", + " <td>0.00%</td>\n", + " <td>11</td>\n", + " </tr>\n", + " <tr>\n", + " <th>234</th>\n", + " <td>Tokelau</td>\n", + " <td>1357</td>\n", + " <td>1.27</td>\n", + " <td>17</td>\n", + " <td>136</td>\n", + " <td>10</td>\n", + " <td>NaN</td>\n", + " <td>N.A.</td>\n", + " <td>N.A.</td>\n", + " <td>0%</td>\n", + " <td>0.00%</td>\n", + " <td>17</td>\n", + " </tr>\n", + " <tr>\n", + " <th>235</th>\n", + " <td>Holy See</td>\n", + " <td>801</td>\n", + " <td>0.25</td>\n", + " <td>2</td>\n", + " <td>2,003</td>\n", + " <td>0</td>\n", + " <td>NaN</td>\n", + " <td>N.A.</td>\n", + " <td>N.A.</td>\n", + " <td>N.A.</td>\n", + " <td>0.00%</td>\n", + " <td>2</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>235 rows × 12 columns</p>\n", + "</div>" + ], + "text/plain": [ + " Country Population2020 Yearly NetChange Density LandArea \\\n", + "no \n", + "1 China 1439323776 0.39 5,540,090 153 9388211 \n", + "2 India 1380004385 0.99 13,586,631 464 2973190 \n", + "3 United States 331002651 0.59 1,937,734 36 9147420 \n", + "4 Indonesia 273523615 1.07 2,898,047 151 1811570 \n", + "5 Pakistan 220892340 2.00 4,327,022 287 770880 \n", + ".. ... ... ... ... ... ... \n", + "231 Montserrat 4992 0.06 3 50 100 \n", + "232 Falkland Islands 3480 3.05 103 0 12170 \n", + "233 Niue 1626 0.68 11 6 260 \n", + "234 Tokelau 1357 1.27 17 136 10 \n", + "235 Holy See 801 0.25 2 2,003 0 \n", + "\n", + " Migrants FertRate MedAge UrbanPop WorldShare Net Change \n", + "no \n", + "1 -348,399 1.7 38 61% 18.47% 5540090 \n", + "2 -532,687 2.2 28 35% 17.70% 13586631 \n", + "3 954,806 1.8 38 83% 4.25% 1937734 \n", + "4 -98,955 2.3 30 56% 3.51% 2898047 \n", + "5 -233,379 3.6 23 35% 2.83% 4327022 \n", + ".. ... ... ... ... ... ... \n", + "231 NaN N.A. N.A. 10% 0.00% 3 \n", + "232 NaN N.A. N.A. 66% 0.00% 103 \n", + "233 NaN N.A. N.A. 46% 0.00% 11 \n", + "234 NaN N.A. N.A. 0% 0.00% 17 \n", + "235 NaN N.A. N.A. N.A. 
0.00% 2 \n", + "\n", + "[235 rows x 12 columns]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "countries = pd.read_csv(\n", + " \"data/world-population-by-country-2020.csv\", index_col=\"no\"\n", + ")\n", + "\n", + "countries[\"Population2020\"] = countries[\"Population2020\"].apply(\n", + " lambda x: int(\"\".join(x.split(\",\")))\n", + ")\n", + "countries[\"Net Change\"] = countries[\"NetChange\"].apply(\n", + " lambda x: int(\"\".join(x.split(\",\")))\n", + ")\n", + "countries[\"Yearly\"] = countries[\"Yearly\"].apply(\n", + " lambda x: float(\"\".join(x.rstrip(\"%\")))\n", + ")\n", + "countries[\"LandArea\"] = countries[\"LandArea\"].apply(\n", + " lambda x: int(\"\".join(x.split(\",\")))\n", + ")\n", + "countries" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Унитарное кодирование признаков Пол (Sex) и Порт посадки (Embarked)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Кодирование" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import OneHotEncoder\n", + "import numpy as np\n", + "\n", + "# encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n", + "\n", + "# encoded_values = encoder.fit_transform(titanic[[\"Embarked\", \"Sex\"]])\n", + "\n", + "# encoded_columns = encoder.get_feature_names_out([\"Embarked\", \"Sex\"])\n", + "\n", + "# encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n", + "\n", + "# encoded_values_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Добавление признаков в исходный Dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "# titanic = pd.concat([titanic, encoded_values_df], axis=1)\n", + "\n", + "# titanic" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "#### Дискретизация признаков" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Равномерное разделение данных на 3 группы" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "labels = [\"Small\", \"Middle\", \"Big\"]\n", + "num_bins = 3" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([ 0. , 5458956.66666667, 10917913.33333333,\n", + " 16376870. ]),\n", + " array([229, 5, 1]))" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hist1, bins1 = np.histogram(\n", + " countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins=num_bins\n", + ")\n", + "bins1, hist1" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>LandArea</th>\n", + " <th>LandArea</th>\n", + " </tr>\n", + " <tr>\n", + " <th>no</th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>9388211</td>\n", + " <td>(5458956.667, 10917913.333]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2973190</td>\n", + " <td>(0.0, 5458956.667]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>9147420</td>\n", + " <td>(5458956.667, 10917913.333]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1811570</td>\n", + " <td>(0.0, 
5458956.667]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>770880</td>\n", + " <td>(0.0, 5458956.667]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>8358140</td>\n", + " <td>(5458956.667, 10917913.333]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>910770</td>\n", + " <td>(0.0, 5458956.667]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>130170</td>\n", + " <td>(0.0, 5458956.667]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>16376870</td>\n", + " <td>(10917913.333, 16376870.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>1943950</td>\n", + " <td>(0.0, 5458956.667]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>364555</td>\n", + " <td>(0.0, 5458956.667]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>1000000</td>\n", + " <td>(0.0, 5458956.667]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>298170</td>\n", + " <td>(0.0, 5458956.667]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>995450</td>\n", + " <td>(0.0, 5458956.667]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>310070</td>\n", + " <td>(0.0, 5458956.667]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>2267050</td>\n", + " <td>(0.0, 5458956.667]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>769630</td>\n", + " <td>(0.0, 5458956.667]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>1628550</td>\n", + " <td>(0.0, 5458956.667]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>348560</td>\n", + " <td>(0.0, 5458956.667]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>510890</td>\n", + " <td>(0.0, 5458956.667]</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " LandArea LandArea\n", + "no \n", + "1 9388211 (5458956.667, 10917913.333]\n", + "2 2973190 (0.0, 5458956.667]\n", + "3 9147420 (5458956.667, 10917913.333]\n", + 
"4 1811570 (0.0, 5458956.667]\n", + "5 770880 (0.0, 5458956.667]\n", + "6 8358140 (5458956.667, 10917913.333]\n", + "7 910770 (0.0, 5458956.667]\n", + "8 130170 (0.0, 5458956.667]\n", + "9 16376870 (10917913.333, 16376870.0]\n", + "10 1943950 (0.0, 5458956.667]\n", + "11 364555 (0.0, 5458956.667]\n", + "12 1000000 (0.0, 5458956.667]\n", + "13 298170 (0.0, 5458956.667]\n", + "14 995450 (0.0, 5458956.667]\n", + "15 310070 (0.0, 5458956.667]\n", + "16 2267050 (0.0, 5458956.667]\n", + "17 769630 (0.0, 5458956.667]\n", + "18 1628550 (0.0, 5458956.667]\n", + "19 348560 (0.0, 5458956.667]\n", + "20 510890 (0.0, 5458956.667]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat(\n", + " [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins1))], axis=1\n", + ").head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>LandArea</th>\n", + " <th>LandArea</th>\n", + " </tr>\n", + " <tr>\n", + " <th>no</th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>9388211</td>\n", + " <td>Middle</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2973190</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>9147420</td>\n", + " <td>Middle</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1811570</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " 
<td>770880</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>8358140</td>\n", + " <td>Middle</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>910770</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>130170</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>16376870</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>1943950</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>364555</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>1000000</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>298170</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>995450</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>310070</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>2267050</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>769630</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>1628550</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>348560</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>510890</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " LandArea LandArea\n", + "no \n", + "1 9388211 Middle\n", + "2 2973190 Small\n", + "3 9147420 Middle\n", + "4 1811570 Small\n", + "5 770880 Small\n", + "6 8358140 Middle\n", + "7 910770 Small\n", + "8 130170 Small\n", + "9 16376870 Big\n", + "10 1943950 Small\n", + "11 364555 Small\n", + "12 1000000 Small\n", + "13 298170 Small\n", + "14 995450 Small\n", + "15 310070 Small\n", + "16 2267050 Small\n", + "17 769630 Small\n", + "18 1628550 Small\n", + "19 
348560 Small\n", + "20 510890 Small" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins1), labels=labels)], axis=1).head(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Равномерное разделение данных на 3 группы c установкой собственной границы диапазона значений (от 0 до 100)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([ 0., 4000000., 8000000., 12000000.]),\n", + " array([229, 1, 4, 1]))" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels = [\"Small\", \"Middle\", \"Big\"]\n", + "bins2 = np.linspace(0, 12000000, 4)\n", + "\n", + "tmp_bins2 = np.digitize(\n", + " countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins2\n", + ")\n", + "\n", + "hist2 = np.bincount(tmp_bins2 - 1)\n", + "\n", + "bins2, hist2" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>LandArea</th>\n", + " <th>LandArea</th>\n", + " </tr>\n", + " <tr>\n", + " <th>no</th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>9388211</td>\n", + " <td>(8000000.0, 12000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2973190</td>\n", + " <td>(0.0, 4000000.0]</td>\n", + " 
</tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>9147420</td>\n", + " <td>(8000000.0, 12000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1811570</td>\n", + " <td>(0.0, 4000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>770880</td>\n", + " <td>(0.0, 4000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>8358140</td>\n", + " <td>(8000000.0, 12000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>910770</td>\n", + " <td>(0.0, 4000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>130170</td>\n", + " <td>(0.0, 4000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>16376870</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>1943950</td>\n", + " <td>(0.0, 4000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>364555</td>\n", + " <td>(0.0, 4000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>1000000</td>\n", + " <td>(0.0, 4000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>298170</td>\n", + " <td>(0.0, 4000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>995450</td>\n", + " <td>(0.0, 4000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>310070</td>\n", + " <td>(0.0, 4000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>2267050</td>\n", + " <td>(0.0, 4000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>769630</td>\n", + " <td>(0.0, 4000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>1628550</td>\n", + " <td>(0.0, 4000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>348560</td>\n", + " <td>(0.0, 4000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>510890</td>\n", + " <td>(0.0, 4000000.0]</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " LandArea LandArea\n", + 
"no \n", + "1 9388211 (8000000.0, 12000000.0]\n", + "2 2973190 (0.0, 4000000.0]\n", + "3 9147420 (8000000.0, 12000000.0]\n", + "4 1811570 (0.0, 4000000.0]\n", + "5 770880 (0.0, 4000000.0]\n", + "6 8358140 (8000000.0, 12000000.0]\n", + "7 910770 (0.0, 4000000.0]\n", + "8 130170 (0.0, 4000000.0]\n", + "9 16376870 NaN\n", + "10 1943950 (0.0, 4000000.0]\n", + "11 364555 (0.0, 4000000.0]\n", + "12 1000000 (0.0, 4000000.0]\n", + "13 298170 (0.0, 4000000.0]\n", + "14 995450 (0.0, 4000000.0]\n", + "15 310070 (0.0, 4000000.0]\n", + "16 2267050 (0.0, 4000000.0]\n", + "17 769630 (0.0, 4000000.0]\n", + "18 1628550 (0.0, 4000000.0]\n", + "19 348560 (0.0, 4000000.0]\n", + "20 510890 (0.0, 4000000.0]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins2))], axis=1).head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>LandArea</th>\n", + " <th>LandArea</th>\n", + " </tr>\n", + " <tr>\n", + " <th>no</th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>9388211</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2973190</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>9147420</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1811570</td>\n", + " <td>Small</td>\n", + " 
</tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>770880</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>8358140</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>910770</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>130170</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>16376870</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>1943950</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>364555</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>1000000</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>298170</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>995450</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>310070</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>2267050</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>769630</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>1628550</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>348560</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>510890</td>\n", + " <td>Small</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " LandArea LandArea\n", + "no \n", + "1 9388211 Big\n", + "2 2973190 Small\n", + "3 9147420 Big\n", + "4 1811570 Small\n", + "5 770880 Small\n", + "6 8358140 Big\n", + "7 910770 Small\n", + "8 130170 Small\n", + "9 16376870 NaN\n", + "10 1943950 Small\n", + "11 364555 Small\n", + "12 1000000 Small\n", + "13 298170 Small\n", + "14 995450 Small\n", + "15 310070 Small\n", + "16 2267050 Small\n", + "17 769630 Small\n", 
+ "18 1628550 Small\n", + "19 348560 Small\n", + "20 510890 Small" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat(\n", + " [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins2), labels=labels)],\n", + " axis=1,\n", + ").head(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Равномерное разделение данных на 3 группы c установкой собственных интервалов (0 - 39, 40 - 60, 61 - 100)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([0.e+00, 1.e+03, 1.e+05, 5.e+05, 3.e+06, inf]),\n", + " array([52, 77, 56, 44, 6]))" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels2 = [\"Dwarf\", \"Small\", \"Middle\", \"Big\", \"Giant\"]\n", + "hist3, bins3 = np.histogram(\n", + "\n", + " countries[\"LandArea\"].fillna(countries[\"LandArea\"].median()), bins=[0, 1000, 100000, 500000, 3000000, np.inf]\n", + ")\n", + "\n", + "\n", + "bins3, hist3" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>LandArea</th>\n", + " <th>LandArea</th>\n", + " </tr>\n", + " <tr>\n", + " <th>no</th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>9388211</td>\n", + " <td>(3000000.0, inf]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " 
<td>2973190</td>\n", + " <td>(500000.0, 3000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>9147420</td>\n", + " <td>(3000000.0, inf]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1811570</td>\n", + " <td>(500000.0, 3000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>770880</td>\n", + " <td>(500000.0, 3000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>8358140</td>\n", + " <td>(3000000.0, inf]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>910770</td>\n", + " <td>(500000.0, 3000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>130170</td>\n", + " <td>(100000.0, 500000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>16376870</td>\n", + " <td>(3000000.0, inf]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>1943950</td>\n", + " <td>(500000.0, 3000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>364555</td>\n", + " <td>(100000.0, 500000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>1000000</td>\n", + " <td>(500000.0, 3000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>298170</td>\n", + " <td>(100000.0, 500000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>995450</td>\n", + " <td>(500000.0, 3000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>310070</td>\n", + " <td>(100000.0, 500000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>2267050</td>\n", + " <td>(500000.0, 3000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>769630</td>\n", + " <td>(500000.0, 3000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>1628550</td>\n", + " <td>(500000.0, 3000000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>348560</td>\n", + " <td>(100000.0, 500000.0]</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>510890</td>\n", + " 
<td>(500000.0, 3000000.0]</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " LandArea LandArea\n", + "no \n", + "1 9388211 (3000000.0, inf]\n", + "2 2973190 (500000.0, 3000000.0]\n", + "3 9147420 (3000000.0, inf]\n", + "4 1811570 (500000.0, 3000000.0]\n", + "5 770880 (500000.0, 3000000.0]\n", + "6 8358140 (3000000.0, inf]\n", + "7 910770 (500000.0, 3000000.0]\n", + "8 130170 (100000.0, 500000.0]\n", + "9 16376870 (3000000.0, inf]\n", + "10 1943950 (500000.0, 3000000.0]\n", + "11 364555 (100000.0, 500000.0]\n", + "12 1000000 (500000.0, 3000000.0]\n", + "13 298170 (100000.0, 500000.0]\n", + "14 995450 (500000.0, 3000000.0]\n", + "15 310070 (100000.0, 500000.0]\n", + "16 2267050 (500000.0, 3000000.0]\n", + "17 769630 (500000.0, 3000000.0]\n", + "18 1628550 (500000.0, 3000000.0]\n", + "19 348560 (100000.0, 500000.0]\n", + "20 510890 (500000.0, 3000000.0]" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins3))], axis=1).head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>LandArea</th>\n", + " <th>LandArea</th>\n", + " </tr>\n", + " <tr>\n", + " <th>no</th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>9388211</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2973190</td>\n", + 
" <td>Big</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>9147420</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1811570</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>770880</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>8358140</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>910770</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>130170</td>\n", + " <td>Middle</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>16376870</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>1943950</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>364555</td>\n", + " <td>Middle</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>1000000</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>298170</td>\n", + " <td>Middle</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>995450</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>310070</td>\n", + " <td>Middle</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>2267050</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>769630</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>1628550</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>348560</td>\n", + " <td>Middle</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>510890</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " LandArea LandArea\n", + "no \n", + "1 9388211 Giant\n", + "2 2973190 Big\n", + "3 9147420 Giant\n", + "4 1811570 Big\n", + "5 770880 Big\n", + "6 8358140 Giant\n", + "7 910770 Big\n", + "8 130170 Middle\n", + "9 
16376870 Giant\n", + "10 1943950 Big\n", + "11 364555 Middle\n", + "12 1000000 Big\n", + "13 298170 Middle\n", + "14 995450 Big\n", + "15 310070 Middle\n", + "16 2267050 Big\n", + "17 769630 Big\n", + "18 1628550 Big\n", + "19 348560 Middle\n", + "20 510890 Big" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat(\n", + " [countries[\"LandArea\"], pd.cut(countries[\"LandArea\"], list(bins3), labels=labels2)],\n", + " axis=1,\n", + ").head(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Квантильное разделение данных на 5 групп\n" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>LandArea</th>\n", + " <th>LandArea</th>\n", + " </tr>\n", + " <tr>\n", + " <th>no</th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>9388211</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2973190</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>9147420</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1811570</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>770880</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>8358140</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>910770</td>\n", + " <td>4</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>8</th>\n", + " <td>130170</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>16376870</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>1943950</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>364555</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>1000000</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>298170</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>995450</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>310070</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>2267050</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>769630</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>1628550</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>348560</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>510890</td>\n", + " <td>3</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " LandArea LandArea\n", + "no \n", + "1 9388211 4\n", + "2 2973190 4\n", + "3 9147420 4\n", + "4 1811570 4\n", + "5 770880 4\n", + "6 8358140 4\n", + "7 910770 4\n", + "8 130170 2\n", + "9 16376870 4\n", + "10 1943950 4\n", + "11 364555 3\n", + "12 1000000 4\n", + "13 298170 3\n", + "14 995450 4\n", + "15 310070 3\n", + "16 2267050 4\n", + "17 769630 4\n", + "18 1628550 4\n", + "19 348560 3\n", + "20 510890 3" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([countries[\"LandArea\"], pd.qcut(countries[\"LandArea\"], q=5, labels=False)], axis=1).head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ 
+ "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>LandArea</th>\n", + " <th>LandArea</th>\n", + " </tr>\n", + " <tr>\n", + " <th>no</th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>9388211</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2973190</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>9147420</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1811570</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>770880</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>8358140</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>910770</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>130170</td>\n", + " <td>Middle</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>16376870</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>1943950</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>364555</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>1000000</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>298170</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>995450</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>310070</td>\n", + " <td>Big</td>\n", + " 
</tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>2267050</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>769630</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>1628550</td>\n", + " <td>Giant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>348560</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>510890</td>\n", + " <td>Big</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " LandArea LandArea\n", + "no \n", + "1 9388211 Giant\n", + "2 2973190 Giant\n", + "3 9147420 Giant\n", + "4 1811570 Giant\n", + "5 770880 Giant\n", + "6 8358140 Giant\n", + "7 910770 Giant\n", + "8 130170 Middle\n", + "9 16376870 Giant\n", + "10 1943950 Giant\n", + "11 364555 Big\n", + "12 1000000 Giant\n", + "13 298170 Big\n", + "14 995450 Giant\n", + "15 310070 Big\n", + "16 2267050 Giant\n", + "17 769630 Giant\n", + "18 1628550 Giant\n", + "19 348560 Big\n", + "20 510890 Big" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([countries[\"LandArea\"], pd.qcut(countries[\"LandArea\"], q=5, labels=labels2)], axis=1).head(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Пример конструирования признаков на основе существующих\n", + "\n", + "Title - обращение к пассажиру (Mr, Mrs, Miss)\n", + "\n", + "Is_married - замужняя ли женщина\n", + "\n", + "Cabin_type - палуба (тип каюты)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "# titanic_cl = titanic.drop(\n", + "# [\"Embarked_Q\", \"Embarked_S\", \"Embarked_nan\", \"Sex_male\"], axis=1, errors=\"ignore\"\n", + "# )\n", + "# titanic_cl = titanic_cl.dropna()\n", + "\n", + "# titanic_cl[\"Title\"] = [\n", + "# i.split(\",\")[1].split(\".\")[0].strip() for i in titanic_cl[\"Name\"]\n", + "# ]\n", + "\n", + "# 
titanic_cl[\"Is_married\"] = [1 if i == \"Mrs\" else 0 for i in titanic_cl[\"Title\"]]\n", + "\n", + "# titanic_cl[\"Cabin_type\"] = [i[0] for i in titanic_cl[\"Cabin\"]]\n", + "\n", + "# titanic_cl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Пример использования библиотеки Featuretools для автоматического конструирования (синтеза) признаков\n", + "\n", + "https://featuretools.alteryx.com/en/stable/getting_started/using_entitysets.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Загрузка данных\n", + "\n", + "За основу был взят набор данных \"Ecommerce Orders Data Set\" из Kaggle\n", + "\n", + "Используется только 100 первых заказов и связанные с ними объекты\n", + "\n", + "https://www.kaggle.com/datasets/sangamsharmait/ecommerce-orders-data-analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "( no Country Population2020 Yearly NetChange Density \\\n", + " 0 1 China 1439323776 0.39 5540090 153 \n", + " 1 2 India 1380004385 0.99 13586631 464 \n", + " 2 3 United States 331002651 0.59 1937734 36 \n", + " 3 4 Indonesia 273523615 1.07 2898047 151 \n", + " 4 5 Pakistan 220892340 2.00 4327022 287 \n", + " .. ... ... ... ... ... ... \n", + " 230 231 Montserrat 4992 0.06 3 50 \n", + " 231 232 Falkland Islands 3480 3.05 103 0 \n", + " 232 233 Niue 1626 0.68 11 6 \n", + " 233 234 Tokelau 1357 1.27 17 136 \n", + " 234 235 Holy See 801 0.25 2 2,003 \n", + " \n", + " LandArea \n", + " 0 9388211 \n", + " 1 2973190 \n", + " 2 9147420 \n", + " 3 1811570 \n", + " 4 770880 \n", + " .. ... 
\n", + " 230 100 \n", + " 231 12170 \n", + " 232 260 \n", + " 233 10 \n", + " 234 0 \n", + " \n", + " [235 rows x 7 columns],\n", + " Year Population YearlyPer Yearly Median Fertility Density\n", + " 0 2020 7794798739 1.10 83000320 31 2.47 52\n", + " 1 2025 8184437460 0.98 77927744 32 2.54 55\n", + " 2 2030 8548487400 0.87 72809988 33 2.62 57\n", + " 3 2035 8887524213 0.78 67807363 34 2.70 60\n", + " 4 2040 9198847240 0.69 62264605 35 2.77 62\n", + " 5 2045 9481803274 0.61 56591207 35 2.85 64\n", + " 6 2050 9735033990 0.53 50646143 36 2.95 65,\n", + " Country Capital Continent\n", + " 0 Afghanistan Kabul Asia\n", + " 1 Albania Tirana Europe\n", + " 2 Algeria Algiers Africa\n", + " 3 American Samoa Pago Pago Oceania\n", + " 4 Andorra Andorra la Vella Europe\n", + " .. ... ... ...\n", + " 229 Wallis and Futuna Mata-Utu Oceania\n", + " 230 Western Sahara El Aai?�n Africa\n", + " 231 Yemen Sanaa Asia\n", + " 232 Zambia Lusaka Africa\n", + " 233 Zimbabwe Harare Africa\n", + " \n", + " [234 rows x 3 columns])" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import featuretools as ft\n", + "from woodwork.logical_types import Categorical, Datetime\n", + "\n", + "info = pd.read_csv(\"data/world-population-by-country-2020.csv\")\n", + "forcast = pd.read_csv(\"data/world-population-forcast-2020-2050.csv\")\n", + "capitals = pd.read_csv(\"data/countries-continents-capitals.csv\", encoding=\"ISO-8859-1\")\n", + "forcast[\"Population\"] = forcast[\"Population\"].apply(\n", + " lambda x: int(\"\".join(x.split(\",\")))\n", + ")\n", + "forcast[\"YearlyPer\"] = forcast[\"YearlyPer\"].apply(\n", + " lambda x: float(\"\".join(x.rstrip(\"%\")))\n", + ")\n", + "forcast[\"Yearly\"] = forcast[\"Yearly\"].apply(\n", + " lambda x: int(\"\".join(x.split(\",\")))\n", + ")\n", + "info = info.drop([\"Migrants\", \"FertRate\", \"MedAge\", \"UrbanPop\", \"WorldShare\"], axis=1)\n", + "info[\"Population2020\"] = 
info[\"Population2020\"].apply(\n", + " lambda x: int(\"\".join(x.split(\",\")))\n", + ")\n", + "info[\"Yearly\"] = info[\"Yearly\"].apply(\n", + " lambda x: float(\"\".join(x.rstrip(\"%\")))\n", + ")\n", + "info[\"NetChange\"] = info[\"NetChange\"].apply(\n", + " lambda x: int(\"\".join(x.split(\",\")))\n", + ")\n", + "info[\"LandArea\"] = info[\"LandArea\"].apply(\n", + " lambda x: int(\"\".join(x.split(\",\")))\n", + ")\n", + "\n", + "info, forcast, capitals" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Создание сущностей в featuretools\n", + "\n", + "Добавление dataframe'ов с данными в EntitySet с указанием параметров: название сущности (таблицы), первичный ключ, категориальные атрибуты (в том числе даты)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "c:\\Users\\frenk\\OneDrive\\Рабочий стол\\MII_Salin_Oleg_PIbd-33\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n" + ] + }, + { + "data": { + "text/plain": [ + "Entityset: countries\n", + " DataFrames:\n", + " countries [Rows: 235, Columns: 7]\n", + " capitals [Rows: 234, Columns: 3]\n", + " forcast [Rows: 7, Columns: 8]\n", + " Relationships:\n", + " No relationships" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "es = ft.EntitySet(id=\"countries\")\n", + "\n", + "es = es.add_dataframe(\n", + " dataframe_name=\"countries\",\n", + " dataframe=info,\n", + " index=\"no\",\n", + " logical_types={\n", + " \"Country\": Categorical,\n", + " },\n", + ")\n", + "es = es.add_dataframe(\n", + " dataframe_name=\"capitals\",\n", + " dataframe=capitals,\n", + " index=\"Country\",\n", + " logical_types={\n", + " \"Country\": Categorical,\n", + " \"Capital\": Categorical,\n", + " \"Continent\": Categorical,\n", + " },\n", + ")\n", + "es = es.add_dataframe(\n", + " dataframe_name=\"forcast\",\n", + " dataframe=forcast,\n", + " index=\"forcast_id\",\n", + " make_index=True,\n", + " logical_types={\n", + " \"Year\": Datetime,\n", + " },\n", + ")\n", + "\n", + "es" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Настройка связей между сущностями featuretools\n", + "\n", + "Настройка связей между таблицами на уровне ключей\n", + "\n", + "Связь указывается от родителя к потомкам (таблица-родитель, первичный ключ, таблица-потомок, внешний ключ)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Entityset: countries\n", + " DataFrames:\n", + " countries [Rows: 235, Columns: 7]\n", + " capitals [Rows: 234, Columns: 3]\n", + " forcast [Rows: 7, Columns: 8]\n", + " Relationships:\n", + " countries.Country -> capitals.Country" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": 
[ + "es = es.add_relationship(\"capitals\", \"Country\", \"countries\", \"Country\")\n", + "\n", + "es" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Автоматическое конструирование признаков с помощью featuretools\n", + "\n", + "Библиотека применяет различные функции агрегации и трансформации к атрибутам таблицы countries с учетом отношений\n", + "\n", + "Результат помещается в Dataframe feature_matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Country</th>\n", + " <th>Population2020</th>\n", + " <th>Yearly</th>\n", + " <th>NetChange</th>\n", + " <th>LandArea</th>\n", + " <th>capitals.Capital</th>\n", + " <th>capitals.Continent</th>\n", + " </tr>\n", + " <tr>\n", + " <th>no</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>China</td>\n", + " <td>1439323776</td>\n", + " <td>0.39</td>\n", + " <td>5540090</td>\n", + " <td>9388211</td>\n", + " <td>Beijing</td>\n", + " <td>Asia</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>India</td>\n", + " <td>1380004385</td>\n", + " <td>0.99</td>\n", + " <td>13586631</td>\n", + " <td>2973190</td>\n", + " <td>New Delhi</td>\n", + " <td>Asia</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>United States</td>\n", + " <td>331002651</td>\n", + " <td>0.59</td>\n", + " <td>1937734</td>\n", + "
<td>9147420</td>\n", + " <td>Washington, D.C.</td>\n", + " <td>North America</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Indonesia</td>\n", + " <td>273523615</td>\n", + " <td>1.07</td>\n", + " <td>2898047</td>\n", + " <td>1811570</td>\n", + " <td>Jakarta</td>\n", + " <td>Asia</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>Pakistan</td>\n", + " <td>220892340</td>\n", + " <td>2.00</td>\n", + " <td>4327022</td>\n", + " <td>770880</td>\n", + " <td>Islamabad</td>\n", + " <td>Asia</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>231</th>\n", + " <td>Montserrat</td>\n", + " <td>4992</td>\n", + " <td>0.06</td>\n", + " <td>3</td>\n", + " <td>100</td>\n", + " <td>Brades</td>\n", + " <td>North America</td>\n", + " </tr>\n", + " <tr>\n", + " <th>232</th>\n", + " <td>Falkland Islands</td>\n", + " <td>3480</td>\n", + " <td>3.05</td>\n", + " <td>103</td>\n", + " <td>12170</td>\n", + " <td>Stanley</td>\n", + " <td>South America</td>\n", + " </tr>\n", + " <tr>\n", + " <th>233</th>\n", + " <td>Niue</td>\n", + " <td>1626</td>\n", + " <td>0.68</td>\n", + " <td>11</td>\n", + " <td>260</td>\n", + " <td>Alofi</td>\n", + " <td>Oceania</td>\n", + " </tr>\n", + " <tr>\n", + " <th>234</th>\n", + " <td>Tokelau</td>\n", + " <td>1357</td>\n", + " <td>1.27</td>\n", + " <td>17</td>\n", + " <td>10</td>\n", + " <td>Nukunonu</td>\n", + " <td>Oceania</td>\n", + " </tr>\n", + " <tr>\n", + " <th>235</th>\n", + " <td>Holy See</td>\n", + " <td>801</td>\n", + " <td>0.25</td>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>235 rows × 7 columns</p>\n", + "</div>" + ], + "text/plain": [ + " Country Population2020 Yearly NetChange LandArea \\\n", + "no \n", + "1 China 1439323776 0.39 5540090 9388211 
\n", + "2 India 1380004385 0.99 13586631 2973190 \n", + "3 United States 331002651 0.59 1937734 9147420 \n", + "4 Indonesia 273523615 1.07 2898047 1811570 \n", + "5 Pakistan 220892340 2.00 4327022 770880 \n", + ".. ... ... ... ... ... \n", + "231 Montserrat 4992 0.06 3 100 \n", + "232 Falkland Islands 3480 3.05 103 12170 \n", + "233 Niue 1626 0.68 11 260 \n", + "234 Tokelau 1357 1.27 17 10 \n", + "235 Holy See 801 0.25 2 0 \n", + "\n", + " capitals.Capital capitals.Continent \n", + "no \n", + "1 Beijing Asia \n", + "2 New Delhi Asia \n", + "3 Washington, D.C. North America \n", + "4 Jakarta Asia \n", + "5 Islamabad Asia \n", + ".. ... ... \n", + "231 Brades North America \n", + "232 Stanley South America \n", + "233 Alofi Oceania \n", + "234 Nukunonu Oceania \n", + "235 NaN NaN \n", + "\n", + "[235 rows x 7 columns]" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_matrix, feature_defs = ft.dfs(\n", + " entityset=es,\n", + " target_dataframe_name=\"countries\",\n", + " max_depth=1,\n", + ")\n", + "\n", + "feature_matrix" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Полученные признаки\n", + "\n", + "Список колонок полученного dataframe'а" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[<Feature: Country>,\n", + " <Feature: Population2020>,\n", + " <Feature: Yearly>,\n", + " <Feature: NetChange>,\n", + " <Feature: LandArea>,\n", + " <Feature: capitals.Capital>,\n", + " <Feature: capitals.Continent>]" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_defs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Отсечение значений признаков" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Определение выбросов с помощью boxplot" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<Axes: >" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "countries.boxplot(column=\"Population2020\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Отсечение значений признака Population2020 по верхней границе 50 000 000" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Country</th>\n", + " <th>Population2020</th>\n", + " <th>PopulationClip</th>\n", + " </tr>\n", + " <tr>\n", + " <th>no</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>China</td>\n", + " <td>1439323776</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>India</td>\n", + " <td>1380004385</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>United States</td>\n", + " <td>331002651</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Indonesia</td>\n", + " <td>273523615</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>Pakistan</td>\n", + " <td>220892340</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>Brazil</td>\n", + " <td>212559417</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>Nigeria</td>\n", +
" <td>206139589</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>Bangladesh</td>\n", + " <td>164689383</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>Russia</td>\n", + " <td>145934462</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>Mexico</td>\n", + " <td>128932753</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>Japan</td>\n", + " <td>126476461</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>Ethiopia</td>\n", + " <td>114963588</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>Philippines</td>\n", + " <td>109581078</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>Egypt</td>\n", + " <td>102334404</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>Vietnam</td>\n", + " <td>97338579</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>DR Congo</td>\n", + " <td>89561403</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>Turkey</td>\n", + " <td>84339067</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>Iran</td>\n", + " <td>83992949</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>Germany</td>\n", + " <td>83783942</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>Thailand</td>\n", + " <td>69799978</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>United Kingdom</td>\n", + " <td>67886011</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>France</td>\n", + " <td>65273511</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>Italy</td>\n", + " 
<td>60461826</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>Tanzania</td>\n", + " <td>59734218</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>South Africa</td>\n", + " <td>59308690</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>Myanmar</td>\n", + " <td>54409800</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>Kenya</td>\n", + " <td>53771296</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>South Korea</td>\n", + " <td>51269185</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>Colombia</td>\n", + " <td>50882891</td>\n", + " <td>50000000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Country Population2020 PopulationClip\n", + "no \n", + "1 China 1439323776 50000000\n", + "2 India 1380004385 50000000\n", + "3 United States 331002651 50000000\n", + "4 Indonesia 273523615 50000000\n", + "5 Pakistan 220892340 50000000\n", + "6 Brazil 212559417 50000000\n", + "7 Nigeria 206139589 50000000\n", + "8 Bangladesh 164689383 50000000\n", + "9 Russia 145934462 50000000\n", + "10 Mexico 128932753 50000000\n", + "11 Japan 126476461 50000000\n", + "12 Ethiopia 114963588 50000000\n", + "13 Philippines 109581078 50000000\n", + "14 Egypt 102334404 50000000\n", + "15 Vietnam 97338579 50000000\n", + "16 DR Congo 89561403 50000000\n", + "17 Turkey 84339067 50000000\n", + "18 Iran 83992949 50000000\n", + "19 Germany 83783942 50000000\n", + "20 Thailand 69799978 50000000\n", + "21 United Kingdom 67886011 50000000\n", + "22 France 65273511 50000000\n", + "23 Italy 60461826 50000000\n", + "24 Tanzania 59734218 50000000\n", + "25 South Africa 59308690 50000000\n", + "26 Myanmar 54409800 50000000\n", + "27 Kenya 53771296 50000000\n", + "28 South Korea 51269185 50000000\n", + "29 Colombia 50882891 
50000000" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "countries_norm = countries.copy()\n", + "\n", + "countries_norm[\"PopulationClip\"] = countries_norm[\"Population2020\"].clip(0, 50000000);\n", + "\n", + "countries_norm[countries_norm[\"Population2020\"] > 50000000][\n", + " [\"Country\", \"Population2020\", \"PopulationClip\"]\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Винсоризация признака Population2020" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "111195830.99999991\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Country</th>\n", + " <th>Population2020</th>\n", + " <th>PopulationWinsorized</th>\n", + " </tr>\n", + " <tr>\n", + " <th>no</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>China</td>\n", + " <td>1439323776</td>\n", + " <td>114963588</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>India</td>\n", + " <td>1380004385</td>\n", + " <td>114963588</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>United States</td>\n", + " <td>331002651</td>\n", + " <td>114963588</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Indonesia</td>\n", + " <td>273523615</td>\n", + " <td>114963588</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>Pakistan</td>\n", + " <td>220892340</td>\n", + "
<td>114963588</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>Brazil</td>\n", + " <td>212559417</td>\n", + " <td>114963588</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>Nigeria</td>\n", + " <td>206139589</td>\n", + " <td>114963588</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>Bangladesh</td>\n", + " <td>164689383</td>\n", + " <td>114963588</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>Russia</td>\n", + " <td>145934462</td>\n", + " <td>114963588</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>Mexico</td>\n", + " <td>128932753</td>\n", + " <td>114963588</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>Japan</td>\n", + " <td>126476461</td>\n", + " <td>114963588</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>Ethiopia</td>\n", + " <td>114963588</td>\n", + " <td>114963588</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>Philippines</td>\n", + " <td>109581078</td>\n", + " <td>109581078</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>Egypt</td>\n", + " <td>102334404</td>\n", + " <td>102334404</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>Vietnam</td>\n", + " <td>97338579</td>\n", + " <td>97338579</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>DR Congo</td>\n", + " <td>89561403</td>\n", + " <td>89561403</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>Turkey</td>\n", + " <td>84339067</td>\n", + " <td>84339067</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>Iran</td>\n", + " <td>83992949</td>\n", + " <td>83992949</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>Germany</td>\n", + " <td>83783942</td>\n", + " <td>83783942</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>Thailand</td>\n", + " <td>69799978</td>\n", + " <td>69799978</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>United Kingdom</td>\n", + " 
<td>67886011</td>\n", + " <td>67886011</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>France</td>\n", + " <td>65273511</td>\n", + " <td>65273511</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>Italy</td>\n", + " <td>60461826</td>\n", + " <td>60461826</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>Tanzania</td>\n", + " <td>59734218</td>\n", + " <td>59734218</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>South Africa</td>\n", + " <td>59308690</td>\n", + " <td>59308690</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>Myanmar</td>\n", + " <td>54409800</td>\n", + " <td>54409800</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>Kenya</td>\n", + " <td>53771296</td>\n", + " <td>53771296</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>South Korea</td>\n", + " <td>51269185</td>\n", + " <td>51269185</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>Colombia</td>\n", + " <td>50882891</td>\n", + " <td>50882891</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Country Population2020 PopulationWinsorized\n", + "no \n", + "1 China 1439323776 114963588\n", + "2 India 1380004385 114963588\n", + "3 United States 331002651 114963588\n", + "4 Indonesia 273523615 114963588\n", + "5 Pakistan 220892340 114963588\n", + "6 Brazil 212559417 114963588\n", + "7 Nigeria 206139589 114963588\n", + "8 Bangladesh 164689383 114963588\n", + "9 Russia 145934462 114963588\n", + "10 Mexico 128932753 114963588\n", + "11 Japan 126476461 114963588\n", + "12 Ethiopia 114963588 114963588\n", + "13 Philippines 109581078 109581078\n", + "14 Egypt 102334404 102334404\n", + "15 Vietnam 97338579 97338579\n", + "16 DR Congo 89561403 89561403\n", + "17 Turkey 84339067 84339067\n", + "18 Iran 83992949 83992949\n", + "19 Germany 83783942 83783942\n", + "20 Thailand 69799978 69799978\n", + "21 United Kingdom 67886011 67886011\n", + "22 France 65273511 
65273511\n", + "23 Italy 60461826 60461826\n", + "24 Tanzania 59734218 59734218\n", + "25 South Africa 59308690 59308690\n", + "26 Myanmar 54409800 54409800\n", + "27 Kenya 53771296 53771296\n", + "28 South Korea 51269185 51269185\n", + "29 Colombia 50882891 50882891" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from scipy.stats.mstats import winsorize\n", + "\n", + "print(countries_norm[\"Population2020\"].quantile(q=0.95))\n", + "\n", + "countries_norm[\"PopulationWinsorized\"] = winsorize(\n", + " countries_norm[\"Population2020\"].fillna(countries_norm[\"Population2020\"].mean()),\n", + " (0, 0.05),\n", + " inplace=False,\n", + ")\n", + "\n", + "countries_norm[countries_norm[\"Population2020\"] > 50000000][\n", + " [\"Country\", \"Population2020\", \"PopulationWinsorized\"]\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Нормализация значений" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Country</th>\n", + " <th>Population2020</th>\n", + " <th>PopulationNorm</th>\n", + " <th>PopulationClipNorm</th>\n", + " <th>PopulationWinsorizedNorm</th>\n", + " <th>PopulationWinsorizedNorm2</th>\n", + " </tr>\n", + " <tr>\n", + " <th>no</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>China</td>\n", + " 
<td>1439323776</td>\n", + " <td>1.000000e+00</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>India</td>\n", + " <td>1380004385</td>\n", + " <td>9.587866e-01</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>United States</td>\n", + " <td>331002651</td>\n", + " <td>2.299705e-01</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Indonesia</td>\n", + " <td>273523615</td>\n", + " <td>1.900357e-01</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>Pakistan</td>\n", + " <td>220892340</td>\n", + " <td>1.534691e-01</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>231</th>\n", + " <td>Montserrat</td>\n", + " <td>4992</td>\n", + " <td>2.911786e-06</td>\n", + " <td>0.000084</td>\n", + " <td>0.000036</td>\n", + " <td>-0.999927</td>\n", + " </tr>\n", + " <tr>\n", + " <th>232</th>\n", + " <td>Falkland Islands</td>\n", + " <td>3480</td>\n", + " <td>1.861292e-06</td>\n", + " <td>0.000054</td>\n", + " <td>0.000023</td>\n", + " <td>-0.999953</td>\n", + " </tr>\n", + " <tr>\n", + " <th>233</th>\n", + " <td>Niue</td>\n", + " <td>1626</td>\n", + " <td>5.731862e-07</td>\n", + " <td>0.000017</td>\n", + " <td>0.000007</td>\n", + " <td>-0.999986</td>\n", + " </tr>\n", + " <tr>\n", + " <th>234</th>\n", + " <td>Tokelau</td>\n", + " <td>1357</td>\n", + " <td>3.862927e-07</td>\n", + " <td>0.000011</td>\n", + " <td>0.000005</td>\n", + " <td>-0.999990</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>235</th>\n", + " <td>Holy See</td>\n", + " <td>801</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-1.000000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>235 rows × 6 columns</p>\n", + "</div>" + ], + "text/plain": [ + " Country Population2020 PopulationNorm PopulationClipNorm \\\n", + "no \n", + "1 China 1439323776 1.000000e+00 1.000000 \n", + "2 India 1380004385 9.587866e-01 1.000000 \n", + "3 United States 331002651 2.299705e-01 1.000000 \n", + "4 Indonesia 273523615 1.900357e-01 1.000000 \n", + "5 Pakistan 220892340 1.534691e-01 1.000000 \n", + ".. ... ... ... ... \n", + "231 Montserrat 4992 2.911786e-06 0.000084 \n", + "232 Falkland Islands 3480 1.861292e-06 0.000054 \n", + "233 Niue 1626 5.731862e-07 0.000017 \n", + "234 Tokelau 1357 3.862927e-07 0.000011 \n", + "235 Holy See 801 0.000000e+00 0.000000 \n", + "\n", + " PopulationWinsorizedNorm PopulationWinsorizedNorm2 \n", + "no \n", + "1 1.000000 1.000000 \n", + "2 1.000000 1.000000 \n", + "3 1.000000 1.000000 \n", + "4 1.000000 1.000000 \n", + "5 1.000000 1.000000 \n", + ".. ... ... 
\n", + "231 0.000036 -0.999927 \n", + "232 0.000023 -0.999953 \n", + "233 0.000007 -0.999986 \n", + "234 0.000005 -0.999990 \n", + "235 0.000000 -1.000000 \n", + "\n", + "[235 rows x 6 columns]" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn import preprocessing\n", + "\n", + "min_max_scaler = preprocessing.MinMaxScaler()\n", + "\n", + "min_max_scaler_2 = preprocessing.MinMaxScaler(feature_range=(-1, 1))\n", + "\n", + "countries_norm[\"PopulationNorm\"] = min_max_scaler.fit_transform(\n", + " countries_norm[\"Population2020\"].to_numpy().reshape(-1, 1)\n", + ").reshape(countries_norm[\"Population2020\"].shape)\n", + "\n", + "countries_norm[\"PopulationClipNorm\"] = min_max_scaler.fit_transform(\n", + " countries_norm[\"PopulationClip\"].to_numpy().reshape(-1, 1)\n", + ").reshape(countries_norm[\"Population2020\"].shape)\n", + "\n", + "countries_norm[\"PopulationWinsorizedNorm\"] = min_max_scaler.fit_transform(\n", + " countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n", + ").reshape(countries_norm[\"Population2020\"].shape)\n", + "\n", + "countries_norm[\"PopulationWinsorizedNorm2\"] = min_max_scaler_2.fit_transform(\n", + " countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n", + ").reshape(countries_norm[\"Population2020\"].shape)\n", + "\n", + "countries_norm[\n", + " [\n", + " \"Country\",\n", + " \"Population2020\",\n", + " \"PopulationNorm\",\n", + " \"PopulationClipNorm\",\n", + " \"PopulationWinsorizedNorm\",\n", + " \"PopulationWinsorizedNorm2\",\n", + " ]\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Стандартизация значений" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr 
th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Country</th>\n", + " <th>Population2020</th>\n", + " <th>PopulationStand</th>\n", + " <th>PopulationClipStand</th>\n", + " <th>PopulationWinsorizedStand</th>\n", + " </tr>\n", + " <tr>\n", + " <th>no</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>China</td>\n", + " <td>1439323776</td>\n", + " <td>10.427597</td>\n", + " <td>2.073933</td>\n", + " <td>3.171659</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>India</td>\n", + " <td>1380004385</td>\n", + " <td>9.987702</td>\n", + " <td>2.073933</td>\n", + " <td>3.171659</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>United States</td>\n", + " <td>331002651</td>\n", + " <td>2.208627</td>\n", + " <td>2.073933</td>\n", + " <td>3.171659</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Indonesia</td>\n", + " <td>273523615</td>\n", + " <td>1.782380</td>\n", + " <td>2.073933</td>\n", + " <td>3.171659</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>Pakistan</td>\n", + " <td>220892340</td>\n", + " <td>1.392082</td>\n", + " <td>2.073933</td>\n", + " <td>3.171659</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>231</th>\n", + " <td>Montserrat</td>\n", + " <td>4992</td>\n", + " <td>-0.245950</td>\n", + " <td>-0.795071</td>\n", + " <td>-0.621969</td>\n", + " </tr>\n", + " <tr>\n", + " <th>232</th>\n", + " <td>Falkland Islands</td>\n", + " <td>3480</td>\n", + " <td>-0.245962</td>\n", + " <td>-0.795158</td>\n", + " 
<td>-0.622019</td>\n", + " </tr>\n", + " <tr>\n", + " <th>233</th>\n", + " <td>Niue</td>\n", + " <td>1626</td>\n", + " <td>-0.245975</td>\n", + " <td>-0.795265</td>\n", + " <td>-0.622080</td>\n", + " </tr>\n", + " <tr>\n", + " <th>234</th>\n", + " <td>Tokelau</td>\n", + " <td>1357</td>\n", + " <td>-0.245977</td>\n", + " <td>-0.795280</td>\n", + " <td>-0.622089</td>\n", + " </tr>\n", + " <tr>\n", + " <th>235</th>\n", + " <td>Holy See</td>\n", + " <td>801</td>\n", + " <td>-0.245982</td>\n", + " <td>-0.795312</td>\n", + " <td>-0.622107</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>235 rows × 5 columns</p>\n", + "</div>" + ], + "text/plain": [ + " Country Population2020 PopulationStand PopulationClipStand \\\n", + "no \n", + "1 China 1439323776 10.427597 2.073933 \n", + "2 India 1380004385 9.987702 2.073933 \n", + "3 United States 331002651 2.208627 2.073933 \n", + "4 Indonesia 273523615 1.782380 2.073933 \n", + "5 Pakistan 220892340 1.392082 2.073933 \n", + ".. ... ... ... ... \n", + "231 Montserrat 4992 -0.245950 -0.795071 \n", + "232 Falkland Islands 3480 -0.245962 -0.795158 \n", + "233 Niue 1626 -0.245975 -0.795265 \n", + "234 Tokelau 1357 -0.245977 -0.795280 \n", + "235 Holy See 801 -0.245982 -0.795312 \n", + "\n", + " PopulationWinsorizedStand \n", + "no \n", + "1 3.171659 \n", + "2 3.171659 \n", + "3 3.171659 \n", + "4 3.171659 \n", + "5 3.171659 \n", + ".. ... 
\n", + "231 -0.621969 \n", + "232 -0.622019 \n", + "233 -0.622080 \n", + "234 -0.622089 \n", + "235 -0.622107 \n", + "\n", + "[235 rows x 5 columns]" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn import preprocessing\n", + "\n", + "stndart_scaler = preprocessing.StandardScaler()\n", + "\n", + "countries_norm[\"PopulationStand\"] = stndart_scaler.fit_transform(\n", + " countries_norm[\"Population2020\"].to_numpy().reshape(-1, 1)\n", + ").reshape(countries_norm[\"Population2020\"].shape)\n", + "\n", + "countries_norm[\"PopulationClipStand\"] = stndart_scaler.fit_transform(\n", + " countries_norm[\"PopulationClip\"].to_numpy().reshape(-1, 1)\n", + ").reshape(countries_norm[\"Population2020\"].shape)\n", + "\n", + "countries_norm[\"PopulationWinsorizedStand\"] = stndart_scaler.fit_transform(\n", + " countries_norm[\"PopulationWinsorized\"].to_numpy().reshape(-1, 1)\n", + ").reshape(countries_norm[\"Population2020\"].shape)\n", + "\n", + "countries_norm[\n", + " [\n", + " \"Country\",\n", + " \"Population2020\",\n", + " \"PopulationStand\",\n", + " \"PopulationClipStand\",\n", + " \"PopulationWinsorizedStand\",\n", + " ]\n", + "]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/poetry.lock b/poetry.lock index 3899a43..2f5cecc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -467,6 +467,17 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} +[[package]] +name = "cloudpickle" +version = "3.1.0" +description = "Pickler class to extend the standard pickle.Pickler 
functionality" +optional = false +python-versions = ">=3.8" +files = [ + {file = "cloudpickle-3.1.0-py3-none-any.whl", hash = "sha256:fe11acda67f61aaaec473e3afe030feb131d78a43461b718185363384f1ba12e"}, + {file = "cloudpickle-3.1.0.tar.gz", hash = "sha256:81a929b6e3c7335c863c771d673d105f02efdb89dfaba0c90495d1c64796601b"}, +] + [[package]] name = "colorama" version = "0.4.6" @@ -675,6 +686,41 @@ files = [ [package.extras] devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"] +[[package]] +name = "featuretools" +version = "1.31.0" +description = "a framework for automated feature engineering" +optional = false +python-versions = "<4,>=3.9" +files = [ + {file = "featuretools-1.31.0-py3-none-any.whl", hash = "sha256:87c94e9ae959c89acd83da96bd2583f3ef0f6daaa9639cbb6e46dbde2c742a18"}, + {file = "featuretools-1.31.0.tar.gz", hash = "sha256:01bfb17fcc1715b4c3623c7bc94a8982122c4a0fa03350ed478601bb81f90155"}, +] + +[package.dependencies] +cloudpickle = ">=1.5.0" +holidays = ">=0.17" +numpy = ">=1.25.0" +packaging = ">=20.0" +pandas = ">=2.0.0" +psutil = ">=5.7.0" +scipy = ">=1.10.0" +tqdm = ">=4.66.3" +woodwork = ">=0.28.0" + +[package.extras] +autonormalize = ["autonormalize (>=2.0.1)"] +complete = ["featuretools[dask,nlp,premium]"] +dask = ["dask[dataframe] (>=2023.2.0)", "distributed (>=2023.2.0)"] +dev = ["black[jupyter] (>=23.1.0)", "featuretools[dask,docs,test]", "pre-commit (>=2.20.0)", "ruff (>=0.1.6)"] +docs = ["Sphinx (==5.1.1)", "autonormalize (>=2.0.1)", "click (>=7.0.0)", "featuretools[dask,test]", "ipython (==8.4.0)", "jupyter (==1.0.0)", "jupyter-client (>=8.0.2)", "matplotlib (==3.7.2)", "myst-parser (==0.18.0)", "nbconvert (==6.5.0)", "nbsphinx (==0.8.9)", "pydata-sphinx-theme (==0.9.0)", "sphinx-copybutton (==0.5.0)", "sphinx-inline-tabs (==2022.1.2b11)"] +nlp = ["nlp-primitives (>=2.12.0)"] +premium = ["premium-primitives (>=0.0.3)"] +sklearn = ["featuretools-sklearn-transformer (>=1.0.0)"] 
+sql = ["featuretools-sql (>=0.0.1)", "psycopg2-binary (>=2.9.3)"] +test = ["boto3 (>=1.34.32)", "composeml (>=0.8.0)", "graphviz (>=0.8.4)", "moto[all] (>=5.0.0)", "pip (>=23.3.0)", "pyarrow (>=14.0.1)", "pympler (>=0.8)", "pytest (>=7.1.2)", "pytest-cov (>=3.0.0)", "pytest-timeout (>=2.1.0)", "pytest-xdist (>=2.5.0)", "smart-open (>=5.0.0)", "urllib3 (>=1.26.18)"] +tsfresh = ["featuretools-tsfresh-primitives (>=1.0.0)"] + [[package]] name = "flask" version = "3.0.3" @@ -833,6 +879,20 @@ files = [ {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, ] +[[package]] +name = "holidays" +version = "0.59" +description = "World Holidays Framework" +optional = false +python-versions = ">=3.9" +files = [ + {file = "holidays-0.59-py3-none-any.whl", hash = "sha256:4576ec7aaad7cd66463236c110bcbd533ac7e739e0e9d3cbeccf8107384a8a92"}, + {file = "holidays-0.59.tar.gz", hash = "sha256:c5cd2e1c0c27a64217b10faf2e8fcc224f5bb64087b56b70c6aff21f6379e6e4"}, +] + +[package.dependencies] +python-dateutil = "*" + [[package]] name = "httpcore" version = "1.0.5" @@ -914,6 +974,25 @@ examples = ["keras (>=2.4.3)", "matplotlib (>=3.1.2)", "pandas (>=1.0.5)", "seab optional = ["keras (>=2.4.3)", "pandas (>=1.0.5)", "tensorflow (>=2.4.3)"] tests = ["black (>=23.3.0)", "flake8 (>=3.8.2)", "keras (>=2.4.3)", "mypy (>=1.3.0)", "pandas (>=1.0.5)", "pytest (>=5.0.1)", "pytest-cov (>=2.9.0)", "tensorflow (>=2.4.3)"] +[[package]] +name = "importlib-resources" +version = "6.4.5" +description = "Read resources from Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "importlib_resources-6.4.5-py3-none-any.whl", hash = "sha256:ac29d5f956f01d5e4bb63102a5a19957f1b9175e45649977264a1416783bb717"}, + {file = "importlib_resources-6.4.5.tar.gz", hash = "sha256:980862a1d16c9e147a59603677fa2aa5fd82b87f223b6cb870695bcfce830065"}, +] + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] +cover = 
["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["jaraco.test (>=5.4)", "pytest (>=6,!=8.1.*)", "zipp (>=3.17)"] +type = ["pytest-mypy"] + [[package]] name = "ipykernel" version = "6.29.5" @@ -2708,6 +2787,11 @@ files = [ {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, + {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, {file = 
"scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, @@ -2939,6 +3023,26 @@ files = [ {file = "tornado-6.4.1.tar.gz", hash = "sha256:92d3ab53183d8c50f8204a51e6f91d18a15d5ef261e84d452800d4ff6fc504e9"}, ] +[[package]] +name = "tqdm" +version = "4.66.5" +description = "Fast, Extensible Progress Meter" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tqdm-4.66.5-py3-none-any.whl", hash = "sha256:90279a3770753eafc9194a0364852159802111925aa30eb3f9d85b0e805ac7cd"}, + {file = "tqdm-4.66.5.tar.gz", hash = "sha256:e1020aef2e5096702d8a025ac7d16b1577279c9d63f8375b63083e9a5f0fcbad"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + [[package]] name = "traitlets" version = "5.14.3" @@ -3110,7 +3214,33 @@ files = [ {file = "widgetsnbextension-4.0.13.tar.gz", hash = "sha256:ffcb67bc9febd10234a362795f643927f4e0c05d9342c727b65d2384f8feacb6"}, ] +[[package]] +name = "woodwork" +version = "0.31.0" +description = "a data typing library for machine learning" +optional = false +python-versions = "<4,>=3.9" +files = [ + {file = "woodwork-0.31.0-py3-none-any.whl", hash = "sha256:5cb3370553b5f466f8c8599b1bf559584dc0b798cc1f2da26bbd7029d256c6f9"}, + {file = "woodwork-0.31.0.tar.gz", hash = "sha256:6ef82af1d5b6525b02efe6417c574c810cfdcc606cb266bd0d7fb17a1d066b67"}, +] + +[package.dependencies] +importlib-resources = ">=5.10.0" +numpy = ">=1.25.0" +pandas = ">=2.0.0" +python-dateutil = ">=2.8.2" +scikit-learn = ">=1.1.0" +scipy = ">=1.10.0" + +[package.extras] +complete = ["woodwork[updater]"] +dev = ["click (>=8.1.7)", "pre-commit (>=2.20.0)", "ruff (>=0.1.6)", "woodwork[docs,test]"] +docs = ["Sphinx (==5.1.1)", "ipython (==8.4.0)", "jupyter 
(==1.0.0)", "myst-parser (==0.18.0)", "nbconvert (==6.5.0)", "nbsphinx (==0.8.9)", "pyarrow (>=14.0.1)", "pydata-sphinx-theme (==0.9.0)", "sphinx-copybutton (==0.5.0)", "sphinx-inline-tabs (==2022.1.2b11)"] +test = ["boto3 (>=1.34.32)", "moto[all] (>=5.0.0)", "pyarrow (>=14.0.1)", "pytest (>=7.0.1)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=2.1.0)", "smart-open (>=5.0.0)"] +updater = ["alteryx-open-src-update-checker (>=3.1.0)"] + [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "a7e3d516bde2d6e4173d8a9770fb5337a0c806dadaeda355084b262c1995f7ea" +content-hash = "09433ce7624fd6af995c85e9e980c57cd417491975e280f0a844931df35e5085" diff --git a/pyproject.toml b/pyproject.toml index 0a91b71..ae221d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,12 @@ apiflask = "^2.2.0" flask-cors = "^5.0.0" scikit-learn = "^1.5.2" imbalanced-learn = "^0.12.3" +featuretools = "^1.31.0" +[tool.poetry.group.dev.dependencies] +ipykernel = "^6.29.5" + [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api"