diff --git a/lec3.ipynb b/lec3.ipynb
index 40f4903..9c7cf09 100644
--- a/lec3.ipynb
+++ b/lec3.ipynb
@@ -18,9 +18,268 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country (or dependency) | \n",
+ " Population 2020 | \n",
+ " Yearly Change | \n",
+ " Net Change | \n",
+ " Density(P/Km²) | \n",
+ " Land Area (Km²) | \n",
+ " Migrants (net) | \n",
+ " Fert. Rate | \n",
+ " MedAge | \n",
+ " Urban Pop % | \n",
+ " World Share | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " China | \n",
+ " 1439323776 | \n",
+ " 0.39 | \n",
+ " 5540090 | \n",
+ " 153 | \n",
+ " 9388211 | \n",
+ " -348,399 | \n",
+ " 1.7 | \n",
+ " 38 | \n",
+ " 61% | \n",
+ " 18.47% | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " India | \n",
+ " 1380004385 | \n",
+ " 0.99 | \n",
+ " 13586631 | \n",
+ " 464 | \n",
+ " 2973190 | \n",
+ " -532,687 | \n",
+ " 2.2 | \n",
+ " 28 | \n",
+ " 35% | \n",
+ " 17.70% | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " United States | \n",
+ " 331002651 | \n",
+ " 0.59 | \n",
+ " 1937734 | \n",
+ " 36 | \n",
+ " 9147420 | \n",
+ " 954,806 | \n",
+ " 1.8 | \n",
+ " 38 | \n",
+ " 83% | \n",
+ " 4.25% | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Indonesia | \n",
+ " 273523615 | \n",
+ " 1.07 | \n",
+ " 2898047 | \n",
+ " 151 | \n",
+ " 1811570 | \n",
+ " -98,955 | \n",
+ " 2.3 | \n",
+ " 30 | \n",
+ " 56% | \n",
+ " 3.51% | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Pakistan | \n",
+ " 220892340 | \n",
+ " 2.00 | \n",
+ " 4327022 | \n",
+ " 287 | \n",
+ " 770880 | \n",
+ " -233,379 | \n",
+ " 3.6 | \n",
+ " 23 | \n",
+ " 35% | \n",
+ " 2.83% | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 231 | \n",
+ " Montserrat | \n",
+ " 4992 | \n",
+ " 0.06 | \n",
+ " 3 | \n",
+ " 50 | \n",
+ " 100 | \n",
+ " NaN | \n",
+ " N.A. | \n",
+ " N.A. | \n",
+ " 10% | \n",
+ " 0.00% | \n",
+ "
\n",
+ " \n",
+ " 232 | \n",
+ " Falkland Islands | \n",
+ " 3480 | \n",
+ " 3.05 | \n",
+ " 103 | \n",
+ " 0 | \n",
+ " 12170 | \n",
+ " NaN | \n",
+ " N.A. | \n",
+ " N.A. | \n",
+ " 66% | \n",
+ " 0.00% | \n",
+ "
\n",
+ " \n",
+ " 233 | \n",
+ " Niue | \n",
+ " 1626 | \n",
+ " 0.68 | \n",
+ " 11 | \n",
+ " 6 | \n",
+ " 260 | \n",
+ " NaN | \n",
+ " N.A. | \n",
+ " N.A. | \n",
+ " 46% | \n",
+ " 0.00% | \n",
+ "
\n",
+ " \n",
+ " 234 | \n",
+ " Tokelau | \n",
+ " 1357 | \n",
+ " 1.27 | \n",
+ " 17 | \n",
+ " 136 | \n",
+ " 10 | \n",
+ " NaN | \n",
+ " N.A. | \n",
+ " N.A. | \n",
+ " 0% | \n",
+ " 0.00% | \n",
+ "
\n",
+ " \n",
+ " 235 | \n",
+ " Holy See | \n",
+ " 801 | \n",
+ " 0.25 | \n",
+ " 2 | \n",
+ " 2,003 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " N.A. | \n",
+ " N.A. | \n",
+ " N.A. | \n",
+ " 0.00% | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
235 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Country (or dependency) Population 2020 Yearly Change Net Change \\\n",
+ "no \n",
+ "1 China 1439323776 0.39 5540090 \n",
+ "2 India 1380004385 0.99 13586631 \n",
+ "3 United States 331002651 0.59 1937734 \n",
+ "4 Indonesia 273523615 1.07 2898047 \n",
+ "5 Pakistan 220892340 2.00 4327022 \n",
+ ".. ... ... ... ... \n",
+ "231 Montserrat 4992 0.06 3 \n",
+ "232 Falkland Islands 3480 3.05 103 \n",
+ "233 Niue 1626 0.68 11 \n",
+ "234 Tokelau 1357 1.27 17 \n",
+ "235 Holy See 801 0.25 2 \n",
+ "\n",
+ " Density(P/Km²) Land Area (Km²) Migrants (net) Fert. Rate MedAge \\\n",
+ "no \n",
+ "1 153 9388211 -348,399 1.7 38 \n",
+ "2 464 2973190 -532,687 2.2 28 \n",
+ "3 36 9147420 954,806 1.8 38 \n",
+ "4 151 1811570 -98,955 2.3 30 \n",
+ "5 287 770880 -233,379 3.6 23 \n",
+ ".. ... ... ... ... ... \n",
+ "231 50 100 NaN N.A. N.A. \n",
+ "232 0 12170 NaN N.A. N.A. \n",
+ "233 6 260 NaN N.A. N.A. \n",
+ "234 136 10 NaN N.A. N.A. \n",
+ "235 2,003 0 NaN N.A. N.A. \n",
+ "\n",
+ " Urban Pop % World Share \n",
+ "no \n",
+ "1 61% 18.47% \n",
+ "2 35% 17.70% \n",
+ "3 83% 4.25% \n",
+ "4 56% 3.51% \n",
+ "5 35% 2.83% \n",
+ ".. ... ... \n",
+ "231 10% 0.00% \n",
+ "232 66% 0.00% \n",
+ "233 46% 0.00% \n",
+ "234 0% 0.00% \n",
+ "235 N.A. 0.00% \n",
+ "\n",
+ "[235 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"import pandas as pd\n",
"\n",
@@ -59,7 +318,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -86,7 +345,7 @@
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -111,7 +370,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -121,9 +380,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(array([ 0. , 5458956.66666667, 10917913.33333333,\n",
+ " 16376870. ]),\n",
+ " array([229, 5, 1]))"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"hist1, bins1 = np.histogram(\n",
" countries[\"Land Area (Km²)\"].fillna(countries[\"Land Area (Km²)\"].median()), bins=num_bins\n",
@@ -133,9 +405,196 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country (or dependency) | \n",
+ " Land Area (Km²) | \n",
+ " Land Area (Km²) | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " China | \n",
+ " 9388211 | \n",
+ " (5458956.667, 10917913.333] | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " India | \n",
+ " 2973190 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " United States | \n",
+ " 9147420 | \n",
+ " (5458956.667, 10917913.333] | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Indonesia | \n",
+ " 1811570 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Pakistan | \n",
+ " 770880 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Brazil | \n",
+ " 8358140 | \n",
+ " (5458956.667, 10917913.333] | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Nigeria | \n",
+ " 910770 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Bangladesh | \n",
+ " 130170 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Russia | \n",
+ " 16376870 | \n",
+ " (10917913.333, 16376870.0] | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Mexico | \n",
+ " 1943950 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Japan | \n",
+ " 364555 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Ethiopia | \n",
+ " 1000000 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Philippines | \n",
+ " 298170 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Egypt | \n",
+ " 995450 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Vietnam | \n",
+ " 310070 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " DR Congo | \n",
+ " 2267050 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Turkey | \n",
+ " 769630 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Iran | \n",
+ " 1628550 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Germany | \n",
+ " 348560 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Thailand | \n",
+ " 510890 | \n",
+ " (0.0, 5458956.667] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Country (or dependency) Land Area (Km²) Land Area (Km²)\n",
+ "no \n",
+ "1 China 9388211 (5458956.667, 10917913.333]\n",
+ "2 India 2973190 (0.0, 5458956.667]\n",
+ "3 United States 9147420 (5458956.667, 10917913.333]\n",
+ "4 Indonesia 1811570 (0.0, 5458956.667]\n",
+ "5 Pakistan 770880 (0.0, 5458956.667]\n",
+ "6 Brazil 8358140 (5458956.667, 10917913.333]\n",
+ "7 Nigeria 910770 (0.0, 5458956.667]\n",
+ "8 Bangladesh 130170 (0.0, 5458956.667]\n",
+ "9 Russia 16376870 (10917913.333, 16376870.0]\n",
+ "10 Mexico 1943950 (0.0, 5458956.667]\n",
+ "11 Japan 364555 (0.0, 5458956.667]\n",
+ "12 Ethiopia 1000000 (0.0, 5458956.667]\n",
+ "13 Philippines 298170 (0.0, 5458956.667]\n",
+ "14 Egypt 995450 (0.0, 5458956.667]\n",
+ "15 Vietnam 310070 (0.0, 5458956.667]\n",
+ "16 DR Congo 2267050 (0.0, 5458956.667]\n",
+ "17 Turkey 769630 (0.0, 5458956.667]\n",
+ "18 Iran 1628550 (0.0, 5458956.667]\n",
+ "19 Germany 348560 (0.0, 5458956.667]\n",
+ "20 Thailand 510890 (0.0, 5458956.667]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"pd.concat(\n",
" [\n",
@@ -149,9 +608,196 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country (or dependency) | \n",
+ " Land Area (Km²) | \n",
+ " Land Area (Km²) | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " China | \n",
+ " 9388211 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " India | \n",
+ " 2973190 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " United States | \n",
+ " 9147420 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Indonesia | \n",
+ " 1811570 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Pakistan | \n",
+ " 770880 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Brazil | \n",
+ " 8358140 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Nigeria | \n",
+ " 910770 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Bangladesh | \n",
+ " 130170 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Russia | \n",
+ " 16376870 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Mexico | \n",
+ " 1943950 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Japan | \n",
+ " 364555 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Ethiopia | \n",
+ " 1000000 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Philippines | \n",
+ " 298170 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Egypt | \n",
+ " 995450 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Vietnam | \n",
+ " 310070 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " DR Congo | \n",
+ " 2267050 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Turkey | \n",
+ " 769630 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Iran | \n",
+ " 1628550 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Germany | \n",
+ " 348560 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Thailand | \n",
+ " 510890 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Country (or dependency) Land Area (Km²) Land Area (Km²)\n",
+ "no \n",
+ "1 China 9388211 Middle\n",
+ "2 India 2973190 Small\n",
+ "3 United States 9147420 Middle\n",
+ "4 Indonesia 1811570 Small\n",
+ "5 Pakistan 770880 Small\n",
+ "6 Brazil 8358140 Middle\n",
+ "7 Nigeria 910770 Small\n",
+ "8 Bangladesh 130170 Small\n",
+ "9 Russia 16376870 Big\n",
+ "10 Mexico 1943950 Small\n",
+ "11 Japan 364555 Small\n",
+ "12 Ethiopia 1000000 Small\n",
+ "13 Philippines 298170 Small\n",
+ "14 Egypt 995450 Small\n",
+ "15 Vietnam 310070 Small\n",
+ "16 DR Congo 2267050 Small\n",
+ "17 Turkey 769630 Small\n",
+ "18 Iran 1628550 Small\n",
+ "19 Germany 348560 Small\n",
+ "20 Thailand 510890 Small"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"pd.concat(\n",
" [\n",
@@ -167,14 +813,26 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Равномерное разделение данных на 3 группы c установкой собственной границы диапазона значений (от 0 до 100)"
+    "Равномерное разделение данных на 4 группы с установкой собственной границы диапазона значений (от 0 до 12000000): просто задаём наименьшее и наибольшее значения и количество групп"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(array([ 0., 4000000., 8000000., 12000000.]),\n",
+ " array([229, 1, 4, 1]))"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"labels = [\"Small\", \"Middle\", \"Big\"]\n",
"bins2 = np.linspace(0, 12000000, 4)\n",
@@ -190,9 +848,196 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country (or dependency) | \n",
+ " Land Area (Km²) | \n",
+ " Land Area (Km²) | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " China | \n",
+ " 9388211 | \n",
+ " (8000000.0, 12000000.0] | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " India | \n",
+ " 2973190 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " United States | \n",
+ " 9147420 | \n",
+ " (8000000.0, 12000000.0] | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Indonesia | \n",
+ " 1811570 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Pakistan | \n",
+ " 770880 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Brazil | \n",
+ " 8358140 | \n",
+ " (8000000.0, 12000000.0] | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Nigeria | \n",
+ " 910770 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Bangladesh | \n",
+ " 130170 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Russia | \n",
+ " 16376870 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Mexico | \n",
+ " 1943950 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Japan | \n",
+ " 364555 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Ethiopia | \n",
+ " 1000000 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Philippines | \n",
+ " 298170 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Egypt | \n",
+ " 995450 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Vietnam | \n",
+ " 310070 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " DR Congo | \n",
+ " 2267050 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Turkey | \n",
+ " 769630 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Iran | \n",
+ " 1628550 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Germany | \n",
+ " 348560 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Thailand | \n",
+ " 510890 | \n",
+ " (0.0, 4000000.0] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Country (or dependency) Land Area (Km²) Land Area (Km²)\n",
+ "no \n",
+ "1 China 9388211 (8000000.0, 12000000.0]\n",
+ "2 India 2973190 (0.0, 4000000.0]\n",
+ "3 United States 9147420 (8000000.0, 12000000.0]\n",
+ "4 Indonesia 1811570 (0.0, 4000000.0]\n",
+ "5 Pakistan 770880 (0.0, 4000000.0]\n",
+ "6 Brazil 8358140 (8000000.0, 12000000.0]\n",
+ "7 Nigeria 910770 (0.0, 4000000.0]\n",
+ "8 Bangladesh 130170 (0.0, 4000000.0]\n",
+ "9 Russia 16376870 NaN\n",
+ "10 Mexico 1943950 (0.0, 4000000.0]\n",
+ "11 Japan 364555 (0.0, 4000000.0]\n",
+ "12 Ethiopia 1000000 (0.0, 4000000.0]\n",
+ "13 Philippines 298170 (0.0, 4000000.0]\n",
+ "14 Egypt 995450 (0.0, 4000000.0]\n",
+ "15 Vietnam 310070 (0.0, 4000000.0]\n",
+ "16 DR Congo 2267050 (0.0, 4000000.0]\n",
+ "17 Turkey 769630 (0.0, 4000000.0]\n",
+ "18 Iran 1628550 (0.0, 4000000.0]\n",
+ "19 Germany 348560 (0.0, 4000000.0]\n",
+ "20 Thailand 510890 (0.0, 4000000.0]"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"pd.concat(\n",
" [\n",
@@ -206,9 +1051,196 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country (or dependency) | \n",
+ " Land Area (Km²) | \n",
+ " Land Area (Km²) | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " China | \n",
+ " 9388211 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " India | \n",
+ " 2973190 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " United States | \n",
+ " 9147420 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Indonesia | \n",
+ " 1811570 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Pakistan | \n",
+ " 770880 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Brazil | \n",
+ " 8358140 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Nigeria | \n",
+ " 910770 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Bangladesh | \n",
+ " 130170 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Russia | \n",
+ " 16376870 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Mexico | \n",
+ " 1943950 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Japan | \n",
+ " 364555 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Ethiopia | \n",
+ " 1000000 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Philippines | \n",
+ " 298170 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Egypt | \n",
+ " 995450 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Vietnam | \n",
+ " 310070 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " DR Congo | \n",
+ " 2267050 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Turkey | \n",
+ " 769630 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Iran | \n",
+ " 1628550 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Germany | \n",
+ " 348560 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Thailand | \n",
+ " 510890 | \n",
+ " Small | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Country (or dependency) Land Area (Km²) Land Area (Km²)\n",
+ "no \n",
+ "1 China 9388211 Big\n",
+ "2 India 2973190 Small\n",
+ "3 United States 9147420 Big\n",
+ "4 Indonesia 1811570 Small\n",
+ "5 Pakistan 770880 Small\n",
+ "6 Brazil 8358140 Big\n",
+ "7 Nigeria 910770 Small\n",
+ "8 Bangladesh 130170 Small\n",
+ "9 Russia 16376870 NaN\n",
+ "10 Mexico 1943950 Small\n",
+ "11 Japan 364555 Small\n",
+ "12 Ethiopia 1000000 Small\n",
+ "13 Philippines 298170 Small\n",
+ "14 Egypt 995450 Small\n",
+ "15 Vietnam 310070 Small\n",
+ "16 DR Congo 2267050 Small\n",
+ "17 Turkey 769630 Small\n",
+ "18 Iran 1628550 Small\n",
+ "19 Germany 348560 Small\n",
+ "20 Thailand 510890 Small"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"pd.concat(\n",
" [\n",
@@ -224,14 +1256,26 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Равномерное разделение данных на 3 группы c установкой собственных интервалов (0 - 39, 40 - 60, 61 - 100)"
+    "Равномерное разделение данных на 5 групп с установкой собственных интервалов (0 - 1000, 1000 - 100000, 100000 - 500000, 500000 - 3000000, 3000000 и более)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(array([0.e+00, 1.e+03, 1.e+05, 5.e+05, 3.e+06, inf]),\n",
+ " array([52, 77, 56, 44, 6]))"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"labels2 = [\"Dwarf\", \"Small\", \"Middle\", \"Big\", \"Giant\"]\n",
"hist3, bins3 = np.histogram(\n",
@@ -245,9 +1289,196 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country (or dependency) | \n",
+ " Land Area (Km²) | \n",
+ " Land Area (Km²) | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " China | \n",
+ " 9388211 | \n",
+ " (3000000.0, inf] | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " India | \n",
+ " 2973190 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " United States | \n",
+ " 9147420 | \n",
+ " (3000000.0, inf] | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Indonesia | \n",
+ " 1811570 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Pakistan | \n",
+ " 770880 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Brazil | \n",
+ " 8358140 | \n",
+ " (3000000.0, inf] | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Nigeria | \n",
+ " 910770 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Bangladesh | \n",
+ " 130170 | \n",
+ " (100000.0, 500000.0] | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Russia | \n",
+ " 16376870 | \n",
+ " (3000000.0, inf] | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Mexico | \n",
+ " 1943950 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Japan | \n",
+ " 364555 | \n",
+ " (100000.0, 500000.0] | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Ethiopia | \n",
+ " 1000000 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Philippines | \n",
+ " 298170 | \n",
+ " (100000.0, 500000.0] | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Egypt | \n",
+ " 995450 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Vietnam | \n",
+ " 310070 | \n",
+ " (100000.0, 500000.0] | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " DR Congo | \n",
+ " 2267050 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Turkey | \n",
+ " 769630 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Iran | \n",
+ " 1628550 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Germany | \n",
+ " 348560 | \n",
+ " (100000.0, 500000.0] | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Thailand | \n",
+ " 510890 | \n",
+ " (500000.0, 3000000.0] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Country (or dependency) Land Area (Km²) Land Area (Km²)\n",
+ "no \n",
+ "1 China 9388211 (3000000.0, inf]\n",
+ "2 India 2973190 (500000.0, 3000000.0]\n",
+ "3 United States 9147420 (3000000.0, inf]\n",
+ "4 Indonesia 1811570 (500000.0, 3000000.0]\n",
+ "5 Pakistan 770880 (500000.0, 3000000.0]\n",
+ "6 Brazil 8358140 (3000000.0, inf]\n",
+ "7 Nigeria 910770 (500000.0, 3000000.0]\n",
+ "8 Bangladesh 130170 (100000.0, 500000.0]\n",
+ "9 Russia 16376870 (3000000.0, inf]\n",
+ "10 Mexico 1943950 (500000.0, 3000000.0]\n",
+ "11 Japan 364555 (100000.0, 500000.0]\n",
+ "12 Ethiopia 1000000 (500000.0, 3000000.0]\n",
+ "13 Philippines 298170 (100000.0, 500000.0]\n",
+ "14 Egypt 995450 (500000.0, 3000000.0]\n",
+ "15 Vietnam 310070 (100000.0, 500000.0]\n",
+ "16 DR Congo 2267050 (500000.0, 3000000.0]\n",
+ "17 Turkey 769630 (500000.0, 3000000.0]\n",
+ "18 Iran 1628550 (500000.0, 3000000.0]\n",
+ "19 Germany 348560 (100000.0, 500000.0]\n",
+ "20 Thailand 510890 (500000.0, 3000000.0]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"pd.concat(\n",
" [\n",
@@ -261,9 +1492,196 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country (or dependency) | \n",
+ " Land Area (Km²) | \n",
+ " Land Area (Km²) | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " China | \n",
+ " 9388211 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " India | \n",
+ " 2973190 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " United States | \n",
+ " 9147420 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Indonesia | \n",
+ " 1811570 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Pakistan | \n",
+ " 770880 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Brazil | \n",
+ " 8358140 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Nigeria | \n",
+ " 910770 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Bangladesh | \n",
+ " 130170 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Russia | \n",
+ " 16376870 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Mexico | \n",
+ " 1943950 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Japan | \n",
+ " 364555 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Ethiopia | \n",
+ " 1000000 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Philippines | \n",
+ " 298170 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Egypt | \n",
+ " 995450 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Vietnam | \n",
+ " 310070 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " DR Congo | \n",
+ " 2267050 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Turkey | \n",
+ " 769630 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Iran | \n",
+ " 1628550 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Germany | \n",
+ " 348560 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Thailand | \n",
+ " 510890 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Country (or dependency) Land Area (Km²) Land Area (Km²)\n",
+ "no \n",
+ "1 China 9388211 Giant\n",
+ "2 India 2973190 Big\n",
+ "3 United States 9147420 Giant\n",
+ "4 Indonesia 1811570 Big\n",
+ "5 Pakistan 770880 Big\n",
+ "6 Brazil 8358140 Giant\n",
+ "7 Nigeria 910770 Big\n",
+ "8 Bangladesh 130170 Middle\n",
+ "9 Russia 16376870 Giant\n",
+ "10 Mexico 1943950 Big\n",
+ "11 Japan 364555 Middle\n",
+ "12 Ethiopia 1000000 Big\n",
+ "13 Philippines 298170 Middle\n",
+ "14 Egypt 995450 Big\n",
+ "15 Vietnam 310070 Middle\n",
+ "16 DR Congo 2267050 Big\n",
+ "17 Turkey 769630 Big\n",
+ "18 Iran 1628550 Big\n",
+ "19 Germany 348560 Middle\n",
+ "20 Thailand 510890 Big"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"pd.concat(\n",
" [\n",
@@ -284,9 +1702,196 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country (or dependency) | \n",
+ " Land Area (Km²) | \n",
+ " Land Area (Km²) | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " China | \n",
+ " 9388211 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " India | \n",
+ " 2973190 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " United States | \n",
+ " 9147420 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Indonesia | \n",
+ " 1811570 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Pakistan | \n",
+ " 770880 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Brazil | \n",
+ " 8358140 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Nigeria | \n",
+ " 910770 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Bangladesh | \n",
+ " 130170 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Russia | \n",
+ " 16376870 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Mexico | \n",
+ " 1943950 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Japan | \n",
+ " 364555 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Ethiopia | \n",
+ " 1000000 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Philippines | \n",
+ " 298170 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Egypt | \n",
+ " 995450 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Vietnam | \n",
+ " 310070 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " DR Congo | \n",
+ " 2267050 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Turkey | \n",
+ " 769630 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Iran | \n",
+ " 1628550 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Germany | \n",
+ " 348560 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Thailand | \n",
+ " 510890 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Country (or dependency) Land Area (Km²) Land Area (Km²)\n",
+ "no \n",
+ "1 China 9388211 4\n",
+ "2 India 2973190 4\n",
+ "3 United States 9147420 4\n",
+ "4 Indonesia 1811570 4\n",
+ "5 Pakistan 770880 4\n",
+ "6 Brazil 8358140 4\n",
+ "7 Nigeria 910770 4\n",
+ "8 Bangladesh 130170 2\n",
+ "9 Russia 16376870 4\n",
+ "10 Mexico 1943950 4\n",
+ "11 Japan 364555 3\n",
+ "12 Ethiopia 1000000 4\n",
+ "13 Philippines 298170 3\n",
+ "14 Egypt 995450 4\n",
+ "15 Vietnam 310070 3\n",
+ "16 DR Congo 2267050 4\n",
+ "17 Turkey 769630 4\n",
+ "18 Iran 1628550 4\n",
+ "19 Germany 348560 3\n",
+ "20 Thailand 510890 3"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"pd.concat(\n",
" [\n",
@@ -300,9 +1905,196 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country (or dependency) | \n",
+ " Land Area (Km²) | \n",
+ " Land Area (Km²) | \n",
+ "
\n",
+ " \n",
+ " no | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " China | \n",
+ " 9388211 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " India | \n",
+ " 2973190 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " United States | \n",
+ " 9147420 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Indonesia | \n",
+ " 1811570 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Pakistan | \n",
+ " 770880 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Brazil | \n",
+ " 8358140 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Nigeria | \n",
+ " 910770 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Bangladesh | \n",
+ " 130170 | \n",
+ " Middle | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Russia | \n",
+ " 16376870 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Mexico | \n",
+ " 1943950 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Japan | \n",
+ " 364555 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Ethiopia | \n",
+ " 1000000 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Philippines | \n",
+ " 298170 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Egypt | \n",
+ " 995450 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Vietnam | \n",
+ " 310070 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " DR Congo | \n",
+ " 2267050 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Turkey | \n",
+ " 769630 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Iran | \n",
+ " 1628550 | \n",
+ " Giant | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Germany | \n",
+ " 348560 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Thailand | \n",
+ " 510890 | \n",
+ " Big | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Country (or dependency) Land Area (Km²) Land Area (Km²)\n",
+ "no \n",
+ "1 China 9388211 Giant\n",
+ "2 India 2973190 Giant\n",
+ "3 United States 9147420 Giant\n",
+ "4 Indonesia 1811570 Giant\n",
+ "5 Pakistan 770880 Giant\n",
+ "6 Brazil 8358140 Giant\n",
+ "7 Nigeria 910770 Giant\n",
+ "8 Bangladesh 130170 Middle\n",
+ "9 Russia 16376870 Giant\n",
+ "10 Mexico 1943950 Giant\n",
+ "11 Japan 364555 Big\n",
+ "12 Ethiopia 1000000 Giant\n",
+ "13 Philippines 298170 Big\n",
+ "14 Egypt 995450 Giant\n",
+ "15 Vietnam 310070 Big\n",
+ "16 DR Congo 2267050 Giant\n",
+ "17 Turkey 769630 Giant\n",
+ "18 Iran 1628550 Giant\n",
+ "19 Germany 348560 Big\n",
+ "20 Thailand 510890 Big"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"pd.concat(\n",
" [\n",
@@ -329,7 +2121,7 @@
},
{
"cell_type": "code",
- "execution_count": 50,
+ "execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
@@ -364,16 +2156,12 @@
"source": [
"#### Загрузка данных\n",
"\n",
- "За основу был взят набор данных \"Ecommerce Orders Data Set\" из Kaggle\n",
- "\n",
- "Используется только 100 первых заказов и связанные с ними объекты\n",
- "\n",
- "https://www.kaggle.com/datasets/sangamsharmait/ecommerce-orders-data-analysis"
+ "приведение даннык к нормальному виду\n"
]
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": 18,
"metadata": {},
"outputs": [
{
@@ -430,7 +2218,7 @@
" [234 rows x 3 columns])"
]
},
- "execution_count": 32,
+ "execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@@ -481,7 +2269,7 @@
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 19,
"metadata": {},
"outputs": [
{
@@ -506,7 +2294,7 @@
" No relationships"
]
},
- "execution_count": 34,
+ "execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
@@ -558,7 +2346,7 @@
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 20,
"metadata": {},
"outputs": [
{
@@ -573,7 +2361,7 @@
" countries.Country (or dependency) -> capitals.Country/Territory"
]
},
- "execution_count": 35,
+ "execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@@ -599,7 +2387,7 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 21,
"metadata": {},
"outputs": [
{
@@ -790,7 +2578,7 @@
"[235 rows x 7 columns]"
]
},
- "execution_count": 36,
+ "execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@@ -816,7 +2604,7 @@
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": 22,
"metadata": {},
"outputs": [
{
@@ -831,7 +2619,7 @@
" ]"
]
},
- "execution_count": 37,
+ "execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
@@ -856,7 +2644,7 @@
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": 23,
"metadata": {},
"outputs": [
{
@@ -865,7 +2653,7 @@
""
]
},
- "execution_count": 38,
+ "execution_count": 23,
"metadata": {},
"output_type": "execute_result"
},
@@ -893,7 +2681,7 @@
},
{
"cell_type": "code",
- "execution_count": 40,
+ "execution_count": 24,
"metadata": {},
"outputs": [
{
@@ -1141,16 +2929,16 @@
"29 Colombia 50882891 50000000"
]
},
- "execution_count": 40,
+ "execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"countries_norm = countries.copy()\n",
- "\n",
+ "#заменяем все значения больше 50000000 на 50000000\n",
"countries_norm[\"Population Clip\"] = countries_norm[\"Population 2020\"].clip(0, 50000000);\n",
- "\n",
+ "#проверка результата\n",
"countries_norm[countries_norm[\"Population 2020\"] > 50000000][\n",
" [\"Country (or dependency)\", \"Population 2020\", \"Population Clip\"]\n",
"]"
@@ -1160,12 +2948,16 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Винсоризация признака Возраст"
+ "Винсоризация \n",
+ " * `winsorize()`: Функция для обработки выбросов с помощью Winsorization. \n",
+ " * `countries_norm[\"Population 2020\"].fillna(countries_norm[\"Population 2020\"].mean())`: Заменяет пропущенные значения в столбце \"Population 2020\" средним значением этого столбца.\n",
+ " * `(0, 0.05)`: Указывает, что нужно обработать как нижние, так и верхние выбросы. 0.05 означает, что 5% самых маленьких и 5% самых больших значений в столбце \"Population 2020\" будут заменены на значения 5-го и 95-го процентилей соответственно. \n",
+ " * `inplace=False`: Указывает, что `winsorize` не должен модифицировать исходный датафрейм `countries_norm` напрямую, а создать новый столбец с обработанными данными."
]
},
{
"cell_type": "code",
- "execution_count": 41,
+ "execution_count": 25,
"metadata": {},
"outputs": [
{
@@ -1420,7 +3212,7 @@
"29 Colombia 50882891 50882891"
]
},
- "execution_count": 41,
+ "execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
@@ -1450,7 +3242,7 @@
},
{
"cell_type": "code",
- "execution_count": 43,
+ "execution_count": 26,
"metadata": {},
"outputs": [
{
@@ -1628,7 +3420,7 @@
"[235 rows x 6 columns]"
]
},
- "execution_count": 43,
+ "execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
@@ -1677,7 +3469,7 @@
},
{
"cell_type": "code",
- "execution_count": 44,
+ "execution_count": 27,
"metadata": {},
"outputs": [
{
@@ -1842,7 +3634,7 @@
"[235 rows x 5 columns]"
]
},
- "execution_count": 44,
+ "execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
diff --git a/lec4.ipynb b/lec4.ipynb
new file mode 100644
index 0000000..1eeba17
--- /dev/null
+++ b/lec4.ipynb
@@ -0,0 +1,2524 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Загрузка набора данных"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Price | \n",
+ " Levy | \n",
+ " Manufacturer | \n",
+ " Model | \n",
+ " Prod_year | \n",
+ " Category | \n",
+ " Leather_interior | \n",
+ " Fuel type | \n",
+ " Engine volume | \n",
+ " Mileage | \n",
+ " Cylinders | \n",
+ " Gear box type | \n",
+ " Drive wheels | \n",
+ " Doors | \n",
+ " Wheel | \n",
+ " Color | \n",
+ " Airbags | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 45654403 | \n",
+ " 13328 | \n",
+ " 1399 | \n",
+ " LEXUS | \n",
+ " RX 450 | \n",
+ " 2010 | \n",
+ " Jeep | \n",
+ " Yes | \n",
+ " Hybrid | \n",
+ " 3.5 | \n",
+ " 186005 km | \n",
+ " 6.0 | \n",
+ " Automatic | \n",
+ " 4x4 | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Silver | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " 44731507 | \n",
+ " 16621 | \n",
+ " 1018 | \n",
+ " CHEVROLET | \n",
+ " Equinox | \n",
+ " 2011 | \n",
+ " Jeep | \n",
+ " No | \n",
+ " Petrol | \n",
+ " 3 | \n",
+ " 192000 km | \n",
+ " 6.0 | \n",
+ " Tiptronic | \n",
+ " 4x4 | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Black | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " 45774419 | \n",
+ " 8467 | \n",
+ " - | \n",
+ " HONDA | \n",
+ " FIT | \n",
+ " 2006 | \n",
+ " Hatchback | \n",
+ " No | \n",
+ " Petrol | \n",
+ " 1.3 | \n",
+ " 200000 km | \n",
+ " 4.0 | \n",
+ " Variator | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Right-hand drive | \n",
+ " Black | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 45769185 | \n",
+ " 3607 | \n",
+ " 862 | \n",
+ " FORD | \n",
+ " Escape | \n",
+ " 2011 | \n",
+ " Jeep | \n",
+ " Yes | \n",
+ " Hybrid | \n",
+ " 2.5 | \n",
+ " 168966 km | \n",
+ " 4.0 | \n",
+ " Automatic | \n",
+ " 4x4 | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " White | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 45809263 | \n",
+ " 11726 | \n",
+ " 446 | \n",
+ " HONDA | \n",
+ " FIT | \n",
+ " 2014 | \n",
+ " Hatchback | \n",
+ " Yes | \n",
+ " Petrol | \n",
+ " 1.3 | \n",
+ " 91901 km | \n",
+ " 4.0 | \n",
+ " Automatic | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Silver | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 45798355 | \n",
+ " 8467 | \n",
+ " - | \n",
+ " MERCEDES-BENZ | \n",
+ " CLK 200 | \n",
+ " 1999 | \n",
+ " Coupe | \n",
+ " Yes | \n",
+ " CNG | \n",
+ " 2.0 Turbo | \n",
+ " 300000 km | \n",
+ " 4.0 | \n",
+ " Manual | \n",
+ " Rear | \n",
+ " 02-Mar | \n",
+ " Left wheel | \n",
+ " Silver | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 45778856 | \n",
+ " 15681 | \n",
+ " 831 | \n",
+ " HYUNDAI | \n",
+ " Sonata | \n",
+ " 2011 | \n",
+ " Sedan | \n",
+ " Yes | \n",
+ " Petrol | \n",
+ " 2.4 | \n",
+ " 161600 km | \n",
+ " 4.0 | \n",
+ " Tiptronic | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Red | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " 45804997 | \n",
+ " 26108 | \n",
+ " 836 | \n",
+ " HYUNDAI | \n",
+ " Tucson | \n",
+ " 2010 | \n",
+ " Jeep | \n",
+ " Yes | \n",
+ " Diesel | \n",
+ " 2 | \n",
+ " 116365 km | \n",
+ " 4.0 | \n",
+ " Automatic | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Grey | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 45793526 | \n",
+ " 5331 | \n",
+ " 1288 | \n",
+ " CHEVROLET | \n",
+ " Captiva | \n",
+ " 2007 | \n",
+ " Jeep | \n",
+ " Yes | \n",
+ " Diesel | \n",
+ " 2 | \n",
+ " 51258 km | \n",
+ " 4.0 | \n",
+ " Automatic | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Black | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 45813273 | \n",
+ " 470 | \n",
+ " 753 | \n",
+ " HYUNDAI | \n",
+ " Sonata | \n",
+ " 2012 | \n",
+ " Sedan | \n",
+ " Yes | \n",
+ " Hybrid | \n",
+ " 2.4 | \n",
+ " 186923 km | \n",
+ " 4.0 | \n",
+ " Automatic | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " White | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
19237 rows × 17 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Price Levy Manufacturer Model Prod_year Category \\\n",
+ "ID \n",
+ "45654403 13328 1399 LEXUS RX 450 2010 Jeep \n",
+ "44731507 16621 1018 CHEVROLET Equinox 2011 Jeep \n",
+ "45774419 8467 - HONDA FIT 2006 Hatchback \n",
+ "45769185 3607 862 FORD Escape 2011 Jeep \n",
+ "45809263 11726 446 HONDA FIT 2014 Hatchback \n",
+ "... ... ... ... ... ... ... \n",
+ "45798355 8467 - MERCEDES-BENZ CLK 200 1999 Coupe \n",
+ "45778856 15681 831 HYUNDAI Sonata 2011 Sedan \n",
+ "45804997 26108 836 HYUNDAI Tucson 2010 Jeep \n",
+ "45793526 5331 1288 CHEVROLET Captiva 2007 Jeep \n",
+ "45813273 470 753 HYUNDAI Sonata 2012 Sedan \n",
+ "\n",
+ " Leather_interior Fuel type Engine volume Mileage Cylinders \\\n",
+ "ID \n",
+ "45654403 Yes Hybrid 3.5 186005 km 6.0 \n",
+ "44731507 No Petrol 3 192000 km 6.0 \n",
+ "45774419 No Petrol 1.3 200000 km 4.0 \n",
+ "45769185 Yes Hybrid 2.5 168966 km 4.0 \n",
+ "45809263 Yes Petrol 1.3 91901 km 4.0 \n",
+ "... ... ... ... ... ... \n",
+ "45798355 Yes CNG 2.0 Turbo 300000 km 4.0 \n",
+ "45778856 Yes Petrol 2.4 161600 km 4.0 \n",
+ "45804997 Yes Diesel 2 116365 km 4.0 \n",
+ "45793526 Yes Diesel 2 51258 km 4.0 \n",
+ "45813273 Yes Hybrid 2.4 186923 km 4.0 \n",
+ "\n",
+ " Gear box type Drive wheels Doors Wheel Color Airbags \n",
+ "ID \n",
+ "45654403 Automatic 4x4 04-May Left wheel Silver 12 \n",
+ "44731507 Tiptronic 4x4 04-May Left wheel Black 8 \n",
+ "45774419 Variator Front 04-May Right-hand drive Black 2 \n",
+ "45769185 Automatic 4x4 04-May Left wheel White 0 \n",
+ "45809263 Automatic Front 04-May Left wheel Silver 4 \n",
+ "... ... ... ... ... ... ... \n",
+ "45798355 Manual Rear 02-Mar Left wheel Silver 5 \n",
+ "45778856 Tiptronic Front 04-May Left wheel Red 8 \n",
+ "45804997 Automatic Front 04-May Left wheel Grey 4 \n",
+ "45793526 Automatic Front 04-May Left wheel Black 4 \n",
+ "45813273 Automatic Front 04-May Left wheel White 12 \n",
+ "\n",
+ "[19237 rows x 17 columns]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "from sklearn import set_config\n",
+ "\n",
+ "set_config(transform_output=\"pandas\")\n",
+ "\n",
+ "random_state=9\n",
+ "\n",
+ "df = pd.read_csv(\"data/car_price_prediction.csv\", index_col=\"ID\")\n",
+ "\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Разделение набора данных на обучающую и тестовые выборки (80/20) для задачи классификации\n",
+ "\n",
+ "Целевой признак -- gear box type - коробка переключения передач. x - полная выборка, y - gear box столбец\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'X_train'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Price | \n",
+ " Levy | \n",
+ " Manufacturer | \n",
+ " Model | \n",
+ " Prod_year | \n",
+ " Category | \n",
+ " Leather_interior | \n",
+ " Fuel type | \n",
+ " Engine volume | \n",
+ " Mileage | \n",
+ " Cylinders | \n",
+ " Gear box type | \n",
+ " Drive wheels | \n",
+ " Doors | \n",
+ " Wheel | \n",
+ " Color | \n",
+ " Airbags | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 45758153 | \n",
+ " 1333 | \n",
+ " 289 | \n",
+ " FORD | \n",
+ " Escape | \n",
+ " 2008 | \n",
+ " Jeep | \n",
+ " Yes | \n",
+ " Hybrid | \n",
+ " 0.4 | \n",
+ " 349288 km | \n",
+ " 4.0 | \n",
+ " Automatic | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Blue | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 45699930 | \n",
+ " 17249 | \n",
+ " - | \n",
+ " FORD | \n",
+ " Escape Hybrid | \n",
+ " 2008 | \n",
+ " Jeep | \n",
+ " No | \n",
+ " Hybrid | \n",
+ " 2.3 | \n",
+ " 147000 km | \n",
+ " 4.0 | \n",
+ " Variator | \n",
+ " 4x4 | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " White | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " 45646562 | \n",
+ " 1333 | \n",
+ " 1053 | \n",
+ " LEXUS | \n",
+ " ES 350 | \n",
+ " 2014 | \n",
+ " Sedan | \n",
+ " Yes | \n",
+ " Petrol | \n",
+ " 3.5 | \n",
+ " 179358 km | \n",
+ " 6.0 | \n",
+ " Automatic | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Red | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " 45656923 | \n",
+ " 9879 | \n",
+ " 1018 | \n",
+ " MERCEDES-BENZ | \n",
+ " ML 350 | \n",
+ " 2011 | \n",
+ " Jeep | \n",
+ " Yes | \n",
+ " Diesel | \n",
+ " 3 | \n",
+ " 275862 km | \n",
+ " 6.0 | \n",
+ " Automatic | \n",
+ " 4x4 | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Silver | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " 45815887 | \n",
+ " 10976 | \n",
+ " 1275 | \n",
+ " HYUNDAI | \n",
+ " Sonata | \n",
+ " 2019 | \n",
+ " Sedan | \n",
+ " Yes | \n",
+ " Petrol | \n",
+ " 2.4 | \n",
+ " 29419 km | \n",
+ " 4.0 | \n",
+ " Automatic | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Blue | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 45802363 | \n",
+ " 21805 | \n",
+ " 1024 | \n",
+ " HYUNDAI | \n",
+ " H1 | \n",
+ " 2010 | \n",
+ " Minivan | \n",
+ " Yes | \n",
+ " Diesel | \n",
+ " 2.5 | \n",
+ " 58958 km | \n",
+ " 4.0 | \n",
+ " Automatic | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Black | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 45812777 | \n",
+ " 220 | \n",
+ " 1327 | \n",
+ " TOYOTA | \n",
+ " Camry | \n",
+ " 2018 | \n",
+ " Sedan | \n",
+ " Yes | \n",
+ " Petrol | \n",
+ " 2.5 | \n",
+ " 47688 km | \n",
+ " 4.0 | \n",
+ " Automatic | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Blue | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " 44104417 | \n",
+ " 15210 | \n",
+ " - | \n",
+ " TOYOTA | \n",
+ " Aqua | \n",
+ " 2014 | \n",
+ " Hatchback | \n",
+ " No | \n",
+ " Hybrid | \n",
+ " 1.5 | \n",
+ " 139000 km | \n",
+ " 4.0 | \n",
+ " Variator | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Right-hand drive | \n",
+ " White | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 45793406 | \n",
+ " 3136 | \n",
+ " - | \n",
+ " OPEL | \n",
+ " Corsa | \n",
+ " 1995 | \n",
+ " Hatchback | \n",
+ " No | \n",
+ " Petrol | \n",
+ " 1.4 | \n",
+ " 100000 km | \n",
+ " 4.0 | \n",
+ " Manual | \n",
+ " Front | \n",
+ " 02-Mar | \n",
+ " Left wheel | \n",
+ " Grey | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 45700700 | \n",
+ " 18817 | \n",
+ " - | \n",
+ " TOYOTA | \n",
+ " Camry | \n",
+ " 2007 | \n",
+ " Sedan | \n",
+ " Yes | \n",
+ " Hybrid | \n",
+ " 2.4 | \n",
+ " 151000 km | \n",
+ " 4.0 | \n",
+ " Variator | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Black | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
15389 rows × 17 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Price Levy Manufacturer Model Prod_year Category \\\n",
+ "ID \n",
+ "45758153 1333 289 FORD Escape 2008 Jeep \n",
+ "45699930 17249 - FORD Escape Hybrid 2008 Jeep \n",
+ "45646562 1333 1053 LEXUS ES 350 2014 Sedan \n",
+ "45656923 9879 1018 MERCEDES-BENZ ML 350 2011 Jeep \n",
+ "45815887 10976 1275 HYUNDAI Sonata 2019 Sedan \n",
+ "... ... ... ... ... ... ... \n",
+ "45802363 21805 1024 HYUNDAI H1 2010 Minivan \n",
+ "45812777 220 1327 TOYOTA Camry 2018 Sedan \n",
+ "44104417 15210 - TOYOTA Aqua 2014 Hatchback \n",
+ "45793406 3136 - OPEL Corsa 1995 Hatchback \n",
+ "45700700 18817 - TOYOTA Camry 2007 Sedan \n",
+ "\n",
+ " Leather_interior Fuel type Engine volume Mileage Cylinders \\\n",
+ "ID \n",
+ "45758153 Yes Hybrid 0.4 349288 km 4.0 \n",
+ "45699930 No Hybrid 2.3 147000 km 4.0 \n",
+ "45646562 Yes Petrol 3.5 179358 km 6.0 \n",
+ "45656923 Yes Diesel 3 275862 km 6.0 \n",
+ "45815887 Yes Petrol 2.4 29419 km 4.0 \n",
+ "... ... ... ... ... ... \n",
+ "45802363 Yes Diesel 2.5 58958 km 4.0 \n",
+ "45812777 Yes Petrol 2.5 47688 km 4.0 \n",
+ "44104417 No Hybrid 1.5 139000 km 4.0 \n",
+ "45793406 No Petrol 1.4 100000 km 4.0 \n",
+ "45700700 Yes Hybrid 2.4 151000 km 4.0 \n",
+ "\n",
+ " Gear box type Drive wheels Doors Wheel Color Airbags \n",
+ "ID \n",
+ "45758153 Automatic Front 04-May Left wheel Blue 0 \n",
+ "45699930 Variator 4x4 04-May Left wheel White 8 \n",
+ "45646562 Automatic Front 04-May Left wheel Red 12 \n",
+ "45656923 Automatic 4x4 04-May Left wheel Silver 12 \n",
+ "45815887 Automatic Front 04-May Left wheel Blue 12 \n",
+ "... ... ... ... ... ... ... \n",
+ "45802363 Automatic Front 04-May Left wheel Black 4 \n",
+ "45812777 Automatic Front 04-May Left wheel Blue 12 \n",
+ "44104417 Variator Front 04-May Right-hand drive White 2 \n",
+ "45793406 Manual Front 02-Mar Left wheel Grey 2 \n",
+ "45700700 Variator Front 04-May Left wheel Black 10 \n",
+ "\n",
+ "[15389 rows x 17 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'y_train'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Gear box type | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 45758153 | \n",
+ " Automatic | \n",
+ "
\n",
+ " \n",
+ " 45699930 | \n",
+ " Variator | \n",
+ "
\n",
+ " \n",
+ " 45646562 | \n",
+ " Automatic | \n",
+ "
\n",
+ " \n",
+ " 45656923 | \n",
+ " Automatic | \n",
+ "
\n",
+ " \n",
+ " 45815887 | \n",
+ " Automatic | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 45802363 | \n",
+ " Automatic | \n",
+ "
\n",
+ " \n",
+ " 45812777 | \n",
+ " Automatic | \n",
+ "
\n",
+ " \n",
+ " 44104417 | \n",
+ " Variator | \n",
+ "
\n",
+ " \n",
+ " 45793406 | \n",
+ " Manual | \n",
+ "
\n",
+ " \n",
+ " 45700700 | \n",
+ " Variator | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
15389 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Gear box type\n",
+ "ID \n",
+ "45758153 Automatic\n",
+ "45699930 Variator\n",
+ "45646562 Automatic\n",
+ "45656923 Automatic\n",
+ "45815887 Automatic\n",
+ "... ...\n",
+ "45802363 Automatic\n",
+ "45812777 Automatic\n",
+ "44104417 Variator\n",
+ "45793406 Manual\n",
+ "45700700 Variator\n",
+ "\n",
+ "[15389 rows x 1 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'X_test'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Price | \n",
+ " Levy | \n",
+ " Manufacturer | \n",
+ " Model | \n",
+ " Prod_year | \n",
+ " Category | \n",
+ " Leather_interior | \n",
+ " Fuel type | \n",
+ " Engine volume | \n",
+ " Mileage | \n",
+ " Cylinders | \n",
+ " Gear box type | \n",
+ " Drive wheels | \n",
+ " Doors | \n",
+ " Wheel | \n",
+ " Color | \n",
+ " Airbags | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 45813151 | \n",
+ " 220 | \n",
+ " 919 | \n",
+ " MERCEDES-BENZ | \n",
+ " ML 350 | \n",
+ " 2012 | \n",
+ " Jeep | \n",
+ " Yes | \n",
+ " Diesel | \n",
+ " 3 | \n",
+ " 209072 km | \n",
+ " 6.0 | \n",
+ " Automatic | \n",
+ " 4x4 | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Grey | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " 45783744 | \n",
+ " 11000 | \n",
+ " - | \n",
+ " JEEP | \n",
+ " Liberty | \n",
+ " 2001 | \n",
+ " Jeep | \n",
+ " Yes | \n",
+ " LPG | \n",
+ " 3.7 | \n",
+ " 137582 km | \n",
+ " 6.0 | \n",
+ " Automatic | \n",
+ " 4x4 | \n",
+ " 04-May | \n",
+ " Right-hand drive | \n",
+ " Silver | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 45805850 | \n",
+ " 10976 | \n",
+ " - | \n",
+ " TOYOTA | \n",
+ " RAV 4 | \n",
+ " 2002 | \n",
+ " Jeep | \n",
+ " Yes | \n",
+ " CNG | \n",
+ " 2 | \n",
+ " 200000 km | \n",
+ " 4.0 | \n",
+ " Automatic | \n",
+ " 4x4 | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " White | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 45816409 | \n",
+ " 1568 | \n",
+ " 753 | \n",
+ " HYUNDAI | \n",
+ " Sonata | \n",
+ " 2012 | \n",
+ " Sedan | \n",
+ " Yes | \n",
+ " Petrol | \n",
+ " 2.4 | \n",
+ " 246230 km | \n",
+ " 4.0 | \n",
+ " Automatic | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Black | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " 45281242 | \n",
+ " 8938 | \n",
+ " 843 | \n",
+ " TOYOTA | \n",
+ " Prius | \n",
+ " 2008 | \n",
+ " Sedan | \n",
+ " No | \n",
+ " Hybrid | \n",
+ " 1.5 | \n",
+ " 133016 km | \n",
+ " 4.0 | \n",
+ " Automatic | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Beige | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 45798478 | \n",
+ " 13172 | \n",
+ " 639 | \n",
+ " FORD | \n",
+ " Focus | \n",
+ " 2014 | \n",
+ " Sedan | \n",
+ " Yes | \n",
+ " Petrol | \n",
+ " 2 | \n",
+ " 134400 km | \n",
+ " 4.0 | \n",
+ " Tiptronic | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Red | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " 45321909 | \n",
+ " 16621 | \n",
+ " - | \n",
+ " TOYOTA | \n",
+ " Prius | \n",
+ " 2010 | \n",
+ " Hatchback | \n",
+ " No | \n",
+ " Hybrid | \n",
+ " 1.8 | \n",
+ " 154000 km | \n",
+ " 4.0 | \n",
+ " Variator | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " White | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 45758118 | \n",
+ " 15681 | \n",
+ " 1811 | \n",
+ " LEXUS | \n",
+ " GX 460 | \n",
+ " 2010 | \n",
+ " Jeep | \n",
+ " Yes | \n",
+ " Petrol | \n",
+ " 4.6 | \n",
+ " 275240 km | \n",
+ " 8.0 | \n",
+ " Automatic | \n",
+ " 4x4 | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Silver | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 45758137 | \n",
+ " 6476 | \n",
+ " - | \n",
+ " NISSAN | \n",
+ " Note | \n",
+ " 2008 | \n",
+ " Hatchback | \n",
+ " No | \n",
+ " CNG | \n",
+ " 1.5 | \n",
+ " 999999999 km | \n",
+ " 4.0 | \n",
+ " Automatic | \n",
+ " 4x4 | \n",
+ " 04-May | \n",
+ " Right-hand drive | \n",
+ " Black | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 45720411 | \n",
+ " 3 | \n",
+ " 697 | \n",
+ " VOLKSWAGEN | \n",
+ " Jetta | \n",
+ " 2015 | \n",
+ " Sedan | \n",
+ " Yes | \n",
+ " Petrol | \n",
+ " 1.8 Turbo | \n",
+ " 65000 km | \n",
+ " 4.0 | \n",
+ " Automatic | \n",
+ " Front | \n",
+ " 04-May | \n",
+ " Left wheel | \n",
+ " Grey | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
3848 rows × 17 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Price Levy Manufacturer Model Prod_year Category \\\n",
+ "ID \n",
+ "45813151 220 919 MERCEDES-BENZ ML 350 2012 Jeep \n",
+ "45783744 11000 - JEEP Liberty 2001 Jeep \n",
+ "45805850 10976 - TOYOTA RAV 4 2002 Jeep \n",
+ "45816409 1568 753 HYUNDAI Sonata 2012 Sedan \n",
+ "45281242 8938 843 TOYOTA Prius 2008 Sedan \n",
+ "... ... ... ... ... ... ... \n",
+ "45798478 13172 639 FORD Focus 2014 Sedan \n",
+ "45321909 16621 - TOYOTA Prius 2010 Hatchback \n",
+ "45758118 15681 1811 LEXUS GX 460 2010 Jeep \n",
+ "45758137 6476 - NISSAN Note 2008 Hatchback \n",
+ "45720411 3 697 VOLKSWAGEN Jetta 2015 Sedan \n",
+ "\n",
+ " Leather_interior Fuel type Engine volume Mileage Cylinders \\\n",
+ "ID \n",
+ "45813151 Yes Diesel 3 209072 km 6.0 \n",
+ "45783744 Yes LPG 3.7 137582 km 6.0 \n",
+ "45805850 Yes CNG 2 200000 km 4.0 \n",
+ "45816409 Yes Petrol 2.4 246230 km 4.0 \n",
+ "45281242 No Hybrid 1.5 133016 km 4.0 \n",
+ "... ... ... ... ... ... \n",
+ "45798478 Yes Petrol 2 134400 km 4.0 \n",
+ "45321909 No Hybrid 1.8 154000 km 4.0 \n",
+ "45758118 Yes Petrol 4.6 275240 km 8.0 \n",
+ "45758137 No CNG 1.5 999999999 km 4.0 \n",
+ "45720411 Yes Petrol 1.8 Turbo 65000 km 4.0 \n",
+ "\n",
+ " Gear box type Drive wheels Doors Wheel Color Airbags \n",
+ "ID \n",
+ "45813151 Automatic 4x4 04-May Left wheel Grey 12 \n",
+ "45783744 Automatic 4x4 04-May Right-hand drive Silver 6 \n",
+ "45805850 Automatic 4x4 04-May Left wheel White 4 \n",
+ "45816409 Automatic Front 04-May Left wheel Black 12 \n",
+ "45281242 Automatic Front 04-May Left wheel Beige 8 \n",
+ "... ... ... ... ... ... ... \n",
+ "45798478 Tiptronic Front 04-May Left wheel Red 8 \n",
+ "45321909 Variator Front 04-May Left wheel White 6 \n",
+ "45758118 Automatic 4x4 04-May Left wheel Silver 0 \n",
+ "45758137 Automatic 4x4 04-May Right-hand drive Black 0 \n",
+ "45720411 Automatic Front 04-May Left wheel Grey 12 \n",
+ "\n",
+ "[3848 rows x 17 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'y_test'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Gear box type | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 45813151 | \n",
+ " Automatic | \n",
+ "
\n",
+ " \n",
+ " 45783744 | \n",
+ " Automatic | \n",
+ "
\n",
+ " \n",
+ " 45805850 | \n",
+ " Automatic | \n",
+ "
\n",
+ " \n",
+ " 45816409 | \n",
+ " Automatic | \n",
+ "
\n",
+ " \n",
+ " 45281242 | \n",
+ " Automatic | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 45798478 | \n",
+ " Tiptronic | \n",
+ "
\n",
+ " \n",
+ " 45321909 | \n",
+ " Variator | \n",
+ "
\n",
+ " \n",
+ " 45758118 | \n",
+ " Automatic | \n",
+ "
\n",
+ " \n",
+ " 45758137 | \n",
+ " Automatic | \n",
+ "
\n",
+ " \n",
+ " 45720411 | \n",
+ " Automatic | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
3848 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Gear box type\n",
+ "ID \n",
+ "45813151 Automatic\n",
+ "45783744 Automatic\n",
+ "45805850 Automatic\n",
+ "45816409 Automatic\n",
+ "45281242 Automatic\n",
+ "... ...\n",
+ "45798478 Tiptronic\n",
+ "45321909 Variator\n",
+ "45758118 Automatic\n",
+ "45758137 Automatic\n",
+ "45720411 Automatic\n",
+ "\n",
+ "[3848 rows x 1 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from utils import split_stratified_into_train_val_test\n",
+ "\n",
+ "X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
+ " df, stratify_colname=\"Gear box type\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=random_state\n",
+ ")\n",
+ "\n",
+ "display(\"X_train\", X_train)\n",
+ "display(\"y_train\", y_train)\n",
+ "\n",
+ "display(\"X_test\", X_test)\n",
+ "display(\"y_test\", y_test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "В итоге, этот код выполняет следующие действия:\n",
+ "\n",
+ "* Заполняет пропущенные значения: В числовых столбцах медианой, в категориальных - значением \"unknown\".\n",
+ "* Стандартизирует числовые данные: приводит их к нулевому среднему и единичному стандартному отклонению.\n",
+ "* Преобразует категориальные данные: использует one-hot-кодирование.\n",
+ "* Удаляет ненужные столбцы: из списка `columns_to_drop`.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Формирование конвейера для классификации данных\n",
+ "\n",
+ "preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n",
+ "\n",
+ "preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n",
+ "\n",
+ "features_preprocessing -- трансформер для предобработки признаков\n",
+ "\n",
+ "features_engineering -- трансформер для конструирования признаков\n",
+ "\n",
+ "drop_columns -- трансформер для удаления колонок\n",
+ "\n",
+ "features_postprocessing -- трансформер для унитарного кодирования новых признаков\n",
+ "\n",
+ "pipeline_end -- основной конвейер предобработки данных и конструирования признаков\n",
+ "\n",
+ "Конвейер выполняется последовательно.\n",
+ "\n",
+ "Трансформер выполняет параллельно для указанного набора колонок.\n",
+ "\n",
+ "Документация: \n",
+ "\n",
+ "https://scikit-learn.org/1.5/api/sklearn.pipeline.html\n",
+ "\n",
+ "https://scikit-learn.org/1.5/modules/generated/sklearn.compose.ColumnTransformer.html#sklearn.compose.ColumnTransformer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.discriminant_analysis import StandardScaler\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.preprocessing import OneHotEncoder\n",
+ "\n",
+ "from transformers import TitanicFeatures\n",
+ "\n",
+ "\n",
+ "#columns_to_drop = [\"Survived\", \"Name\", \"Cabin\", \"Ticket\", \"Embarked\", \"Parch\", \"Fare\"]\n",
+ "columns_to_drop = [\"Doors\", \"Color\", \"Gear box type\", \"Prod_year\", \"Mileage\", \"Airbags\", \"Levy\", \"Leather_interior\", \"Fuel type\", \"Drive wheels\"]\n",
+ "num_columns = [\n",
+ " column\n",
+ " for column in df.columns\n",
+ " if column not in columns_to_drop and df[column].dtype != \"object\"\n",
+ "]\n",
+ "cat_columns = [\n",
+ " column\n",
+ " for column in df.columns\n",
+ " if column not in columns_to_drop and df[column].dtype == \"object\"\n",
+ "]\n",
+ "\n",
+ "num_imputer = SimpleImputer(strategy=\"median\")\n",
+ "num_scaler = StandardScaler()\n",
+ "preprocessing_num = Pipeline(\n",
+ " [\n",
+ " (\"imputer\", num_imputer),\n",
+ " (\"scaler\", num_scaler),\n",
+ " ]\n",
+ ")\n",
+ "\n",
+ "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n",
+ "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
+ "preprocessing_cat = Pipeline(\n",
+ " [\n",
+ " (\"imputer\", cat_imputer),\n",
+ " (\"encoder\", cat_encoder),\n",
+ " ]\n",
+ ")\n",
+ "\n",
+ "features_preprocessing = ColumnTransformer(\n",
+ " verbose_feature_names_out=False,\n",
+ " transformers=[\n",
+ " (\"prepocessing_num\", preprocessing_num, num_columns),\n",
+ " (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
+ " #(\"prepocessing_features\", cat_imputer, [\"Name\", \"Cabin\"]),\n",
+ " ],\n",
+ " remainder=\"passthrough\"\n",
+ ")\n",
+ "\n",
+ "# features_engineering = ColumnTransformer(\n",
+ "# verbose_feature_names_out=False,\n",
+ "# transformers=[\n",
+ "# (\"add_features\", TitanicFeatures(), [\"Name\", \"Cabin\"]),\n",
+ "# ],\n",
+ "# remainder=\"passthrough\",\n",
+ "# )\n",
+ "\n",
+ "drop_columns = ColumnTransformer(\n",
+ " verbose_feature_names_out=False,\n",
+ " transformers=[\n",
+ " (\"drop_columns\", \"drop\", columns_to_drop),\n",
+ " ],\n",
+ " remainder=\"passthrough\",\n",
+ ")\n",
+ "\n",
+ "# features_postprocessing = ColumnTransformer(\n",
+ "# verbose_feature_names_out=False,\n",
+ "# transformers=[\n",
+ "#     (\"preprocessing_cat\", preprocessing_cat, [\"Cabin_type\"]),\n",
+ "# ],\n",
+ "# remainder=\"passthrough\",\n",
+ "# )\n",
+ "\n",
+ "pipeline_end = Pipeline(\n",
+ " [\n",
+ " (\"features_preprocessing\", features_preprocessing),\n",
+ " # (\"features_engineering\", features_engineering),\n",
+ " (\"drop_columns\", drop_columns),\n",
+ " # (\"features_postprocessing\", features_postprocessing),\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Демонстрация работы конвейера для предобработки данных при классификации"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Price | \n",
+ " Cylinders | \n",
+ " Manufacturer_ALFA ROMEO | \n",
+ " Manufacturer_ASTON MARTIN | \n",
+ " Manufacturer_AUDI | \n",
+ " Manufacturer_BENTLEY | \n",
+ " Manufacturer_BMW | \n",
+ " Manufacturer_BUICK | \n",
+ " Manufacturer_CADILLAC | \n",
+ " Manufacturer_CHEVROLET | \n",
+ " ... | \n",
+ " Engine volume_5.7 Turbo | \n",
+ " Engine volume_5.8 | \n",
+ " Engine volume_5.9 | \n",
+ " Engine volume_6 | \n",
+ " Engine volume_6.2 | \n",
+ " Engine volume_6.3 | \n",
+ " Engine volume_6.3 Turbo | \n",
+ " Engine volume_6.7 | \n",
+ " Engine volume_6.8 | \n",
+ " Wheel_Right-hand drive | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 45758153 | \n",
+ " -0.082497 | \n",
+ " -0.485038 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 45699930 | \n",
+ " -0.007675 | \n",
+ " -0.485038 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 45646562 | \n",
+ " -0.082497 | \n",
+ " 1.187062 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 45656923 | \n",
+ " -0.042322 | \n",
+ " 1.187062 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 45815887 | \n",
+ " -0.037165 | \n",
+ " -0.485038 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 45802363 | \n",
+ " 0.013743 | \n",
+ " -0.485038 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 45812777 | \n",
+ " -0.087729 | \n",
+ " -0.485038 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 44104417 | \n",
+ " -0.017260 | \n",
+ " -0.485038 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 45793406 | \n",
+ " -0.074021 | \n",
+ " -0.485038 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 45700700 | \n",
+ " -0.000304 | \n",
+ " -0.485038 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
15389 rows × 1573 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Price Cylinders Manufacturer_ALFA ROMEO \\\n",
+ "ID \n",
+ "45758153 -0.082497 -0.485038 0.0 \n",
+ "45699930 -0.007675 -0.485038 0.0 \n",
+ "45646562 -0.082497 1.187062 0.0 \n",
+ "45656923 -0.042322 1.187062 0.0 \n",
+ "45815887 -0.037165 -0.485038 0.0 \n",
+ "... ... ... ... \n",
+ "45802363 0.013743 -0.485038 0.0 \n",
+ "45812777 -0.087729 -0.485038 0.0 \n",
+ "44104417 -0.017260 -0.485038 0.0 \n",
+ "45793406 -0.074021 -0.485038 0.0 \n",
+ "45700700 -0.000304 -0.485038 0.0 \n",
+ "\n",
+ " Manufacturer_ASTON MARTIN Manufacturer_AUDI Manufacturer_BENTLEY \\\n",
+ "ID \n",
+ "45758153 0.0 0.0 0.0 \n",
+ "45699930 0.0 0.0 0.0 \n",
+ "45646562 0.0 0.0 0.0 \n",
+ "45656923 0.0 0.0 0.0 \n",
+ "45815887 0.0 0.0 0.0 \n",
+ "... ... ... ... \n",
+ "45802363 0.0 0.0 0.0 \n",
+ "45812777 0.0 0.0 0.0 \n",
+ "44104417 0.0 0.0 0.0 \n",
+ "45793406 0.0 0.0 0.0 \n",
+ "45700700 0.0 0.0 0.0 \n",
+ "\n",
+ " Manufacturer_BMW Manufacturer_BUICK Manufacturer_CADILLAC \\\n",
+ "ID \n",
+ "45758153 0.0 0.0 0.0 \n",
+ "45699930 0.0 0.0 0.0 \n",
+ "45646562 0.0 0.0 0.0 \n",
+ "45656923 0.0 0.0 0.0 \n",
+ "45815887 0.0 0.0 0.0 \n",
+ "... ... ... ... \n",
+ "45802363 0.0 0.0 0.0 \n",
+ "45812777 0.0 0.0 0.0 \n",
+ "44104417 0.0 0.0 0.0 \n",
+ "45793406 0.0 0.0 0.0 \n",
+ "45700700 0.0 0.0 0.0 \n",
+ "\n",
+ " Manufacturer_CHEVROLET ... Engine volume_5.7 Turbo \\\n",
+ "ID ... \n",
+ "45758153 0.0 ... 0.0 \n",
+ "45699930 0.0 ... 0.0 \n",
+ "45646562 0.0 ... 0.0 \n",
+ "45656923 0.0 ... 0.0 \n",
+ "45815887 0.0 ... 0.0 \n",
+ "... ... ... ... \n",
+ "45802363 0.0 ... 0.0 \n",
+ "45812777 0.0 ... 0.0 \n",
+ "44104417 0.0 ... 0.0 \n",
+ "45793406 0.0 ... 0.0 \n",
+ "45700700 0.0 ... 0.0 \n",
+ "\n",
+ " Engine volume_5.8 Engine volume_5.9 Engine volume_6 \\\n",
+ "ID \n",
+ "45758153 0.0 0.0 0.0 \n",
+ "45699930 0.0 0.0 0.0 \n",
+ "45646562 0.0 0.0 0.0 \n",
+ "45656923 0.0 0.0 0.0 \n",
+ "45815887 0.0 0.0 0.0 \n",
+ "... ... ... ... \n",
+ "45802363 0.0 0.0 0.0 \n",
+ "45812777 0.0 0.0 0.0 \n",
+ "44104417 0.0 0.0 0.0 \n",
+ "45793406 0.0 0.0 0.0 \n",
+ "45700700 0.0 0.0 0.0 \n",
+ "\n",
+ " Engine volume_6.2 Engine volume_6.3 Engine volume_6.3 Turbo \\\n",
+ "ID \n",
+ "45758153 0.0 0.0 0.0 \n",
+ "45699930 0.0 0.0 0.0 \n",
+ "45646562 0.0 0.0 0.0 \n",
+ "45656923 0.0 0.0 0.0 \n",
+ "45815887 0.0 0.0 0.0 \n",
+ "... ... ... ... \n",
+ "45802363 0.0 0.0 0.0 \n",
+ "45812777 0.0 0.0 0.0 \n",
+ "44104417 0.0 0.0 0.0 \n",
+ "45793406 0.0 0.0 0.0 \n",
+ "45700700 0.0 0.0 0.0 \n",
+ "\n",
+ " Engine volume_6.7 Engine volume_6.8 Wheel_Right-hand drive \n",
+ "ID \n",
+ "45758153 0.0 0.0 0.0 \n",
+ "45699930 0.0 0.0 0.0 \n",
+ "45646562 0.0 0.0 0.0 \n",
+ "45656923 0.0 0.0 0.0 \n",
+ "45815887 0.0 0.0 0.0 \n",
+ "... ... ... ... \n",
+ "45802363 0.0 0.0 0.0 \n",
+ "45812777 0.0 0.0 0.0 \n",
+ "44104417 0.0 0.0 1.0 \n",
+ "45793406 0.0 0.0 0.0 \n",
+ "45700700 0.0 0.0 0.0 \n",
+ "\n",
+ "[15389 rows x 1573 columns]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "preprocessing_result = pipeline_end.fit_transform(X_train)\n",
+ "preprocessed_df = pd.DataFrame(\n",
+ " preprocessing_result,\n",
+ " columns=pipeline_end.get_feature_names_out(),\n",
+ ")\n",
+ "\n",
+ "preprocessed_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Формирование набора моделей для классификации\n",
+ "\n",
+ "logistic -- логистическая регрессия\n",
+ "\n",
+ "ridge -- гребневая регрессия\n",
+ "\n",
+ "decision_tree -- дерево решений\n",
+ "\n",
+ "knn -- k-ближайших соседей\n",
+ "\n",
+ "naive_bayes -- наивный Байесовский классификатор\n",
+ "\n",
+ "gradient_boosting -- метод градиентного бустинга (набор деревьев решений)\n",
+ "\n",
+ "random_forest -- метод случайного леса (набор деревьев решений)\n",
+ "\n",
+ "mlp -- многослойный персептрон (нейронная сеть)\n",
+ "\n",
+ "Документация: https://scikit-learn.org/1.5/supervised_learning.html"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree\n",
+ "\n",
+ "class_models = {\n",
+ " \"logistic\": {\"model\": linear_model.LogisticRegression()},\n",
+ " # \"ridge\": {\"model\": linear_model.RidgeClassifierCV(cv=5, class_weight=\"balanced\")},\n",
+ " \"ridge\": {\"model\": linear_model.LogisticRegression(penalty=\"l2\", class_weight=\"balanced\")},\n",
+ " \"decision_tree\": {\n",
+ " \"model\": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)\n",
+ " },\n",
+ " \"knn\": {\"model\": neighbors.KNeighborsClassifier(n_neighbors=7)},\n",
+ " \"naive_bayes\": {\"model\": naive_bayes.GaussianNB()},\n",
+ " \"gradient_boosting\": {\n",
+ " \"model\": ensemble.GradientBoostingClassifier(n_estimators=210)\n",
+ " },\n",
+ " \"random_forest\": {\n",
+ " \"model\": ensemble.RandomForestClassifier(\n",
+ " max_depth=11, class_weight=\"balanced\", random_state=random_state\n",
+ " )\n",
+ " },\n",
+ " \"mlp\": {\n",
+ " \"model\": neural_network.MLPClassifier(\n",
+ " hidden_layer_sizes=(7,),\n",
+ " max_iter=100000,\n",
+ " early_stopping=True,\n",
+ " random_state=random_state,\n",
+ " )\n",
+ " },\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Gear box type object\n",
+ "dtype: object\n",
+ "Gear box type object\n",
+ "dtype: object\n",
+ "\n",
+ "Index: 19237 entries, 45654403 to 45813273\n",
+ "Data columns (total 17 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Price 19237 non-null int64 \n",
+ " 1 Levy 19237 non-null object \n",
+ " 2 Manufacturer 19237 non-null object \n",
+ " 3 Model 19237 non-null object \n",
+ " 4 Prod_year 19237 non-null int64 \n",
+ " 5 Category 19237 non-null object \n",
+ " 6 Leather_interior 19237 non-null object \n",
+ " 7 Fuel type 19237 non-null object \n",
+ " 8 Engine volume 19237 non-null object \n",
+ " 9 Mileage 19237 non-null object \n",
+ " 10 Cylinders 19237 non-null float64\n",
+ " 11 Gear box type 19237 non-null object \n",
+ " 12 Drive wheels 19237 non-null object \n",
+ " 13 Doors 19237 non-null object \n",
+ " 14 Wheel 19237 non-null object \n",
+ " 15 Color 19237 non-null object \n",
+ " 16 Airbags 19237 non-null int64 \n",
+ "dtypes: float64(1), int64(3), object(13)\n",
+ "memory usage: 2.6+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(y_train.dtypes)\n",
+ "print(y_test.dtypes)\n",
+ "df.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Обучение моделей на обучающем наборе данных и оценка на тестовом"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model: logistic\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
+ "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
+ "\n",
+ "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
+ " https://scikit-learn.org/stable/modules/preprocessing.html\n",
+ "Please also refer to the documentation for alternative solver options:\n",
+ " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
+ " n_iter_i = _check_optimize_result(\n",
+ "c:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1, 3] during transform. These unknown categories will be encoded as all zeros\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "ename": "ValueError",
+ "evalue": "Mix of label input types (string and number)",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[16], line 28\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# class_models[model_name][\"Precision_train\"] = metrics.precision_score(\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# y_train, y_train_predict, average=\"micro\"\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m# )\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;66;03m# class_models[model_name][\"Precision_test\"] = metrics.precision_score(\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# y_test, y_test_predict\u001b[39;00m\n\u001b[0;32m 24\u001b[0m \u001b[38;5;66;03m# )\u001b[39;00m\n\u001b[0;32m 25\u001b[0m class_models[model_name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRecall_train\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m metrics\u001b[38;5;241m.\u001b[39mrecall_score(\n\u001b[0;32m 26\u001b[0m y_train, y_train_predict, average\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmicro\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 27\u001b[0m )\n\u001b[1;32m---> 28\u001b[0m class_models[model_name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRecall_test\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mmetrics\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrecall_score\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 29\u001b[0m \u001b[43m \u001b[49m\u001b[43my_test\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_test_predict\u001b[49m\n\u001b[0;32m 30\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 31\u001b[0m class_models[model_name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAccuracy_train\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m metrics\u001b[38;5;241m.\u001b[39maccuracy_score(\n\u001b[0;32m 32\u001b[0m y_train, y_train_predict\n\u001b[0;32m 33\u001b[0m )\n\u001b[0;32m 34\u001b[0m 
class_models[model_name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAccuracy_test\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m metrics\u001b[38;5;241m.\u001b[39maccuracy_score(\n\u001b[0;32m 35\u001b[0m y_test, y_test_predict\n\u001b[0;32m 36\u001b[0m )\n",
+ "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n",
+ "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:2385\u001b[0m, in \u001b[0;36mrecall_score\u001b[1;34m(y_true, y_pred, labels, pos_label, average, sample_weight, zero_division)\u001b[0m\n\u001b[0;32m 2217\u001b[0m \u001b[38;5;129m@validate_params\u001b[39m(\n\u001b[0;32m 2218\u001b[0m {\n\u001b[0;32m 2219\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my_true\u001b[39m\u001b[38;5;124m\"\u001b[39m: [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124marray-like\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msparse matrix\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2244\u001b[0m zero_division\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwarn\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 2245\u001b[0m ):\n\u001b[0;32m 2246\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Compute the recall.\u001b[39;00m\n\u001b[0;32m 2247\u001b[0m \n\u001b[0;32m 2248\u001b[0m \u001b[38;5;124;03m The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2383\u001b[0m \u001b[38;5;124;03m array([1. , 1. 
, 0.5])\u001b[39;00m\n\u001b[0;32m 2384\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 2385\u001b[0m _, r, _, _ \u001b[38;5;241m=\u001b[39m \u001b[43mprecision_recall_fscore_support\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 2386\u001b[0m \u001b[43m \u001b[49m\u001b[43my_true\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2387\u001b[0m \u001b[43m \u001b[49m\u001b[43my_pred\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2388\u001b[0m \u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2389\u001b[0m \u001b[43m \u001b[49m\u001b[43mpos_label\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpos_label\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2390\u001b[0m \u001b[43m \u001b[49m\u001b[43maverage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maverage\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2391\u001b[0m \u001b[43m \u001b[49m\u001b[43mwarn_for\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrecall\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2392\u001b[0m \u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msample_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2393\u001b[0m \u001b[43m \u001b[49m\u001b[43mzero_division\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mzero_division\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2394\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2395\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m r\n",
+ "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:186\u001b[0m, in \u001b[0;36mvalidate_params..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 184\u001b[0m global_skip_validation \u001b[38;5;241m=\u001b[39m get_config()[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mskip_parameter_validation\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 185\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m global_skip_validation:\n\u001b[1;32m--> 186\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 188\u001b[0m func_sig \u001b[38;5;241m=\u001b[39m signature(func)\n\u001b[0;32m 190\u001b[0m \u001b[38;5;66;03m# Map *args/**kwargs to the function signature\u001b[39;00m\n",
+ "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1789\u001b[0m, in \u001b[0;36mprecision_recall_fscore_support\u001b[1;34m(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight, zero_division)\u001b[0m\n\u001b[0;32m 1626\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Compute precision, recall, F-measure and support for each class.\u001b[39;00m\n\u001b[0;32m 1627\u001b[0m \n\u001b[0;32m 1628\u001b[0m \u001b[38;5;124;03mThe precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1786\u001b[0m \u001b[38;5;124;03m array([2, 2, 2]))\u001b[39;00m\n\u001b[0;32m 1787\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 1788\u001b[0m _check_zero_division(zero_division)\n\u001b[1;32m-> 1789\u001b[0m labels \u001b[38;5;241m=\u001b[39m \u001b[43m_check_set_wise_labels\u001b[49m\u001b[43m(\u001b[49m\u001b[43my_true\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_pred\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maverage\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpos_label\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1791\u001b[0m \u001b[38;5;66;03m# Calculate tp_sum, pred_sum, true_sum ###\u001b[39;00m\n\u001b[0;32m 1792\u001b[0m samplewise \u001b[38;5;241m=\u001b[39m average \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msamples\u001b[39m\u001b[38;5;124m\"\u001b[39m\n",
+ "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1564\u001b[0m, in \u001b[0;36m_check_set_wise_labels\u001b[1;34m(y_true, y_pred, average, labels, pos_label)\u001b[0m\n\u001b[0;32m 1561\u001b[0m y_type, y_true, y_pred \u001b[38;5;241m=\u001b[39m _check_targets(y_true, y_pred)\n\u001b[0;32m 1562\u001b[0m \u001b[38;5;66;03m# Convert to Python primitive type to avoid NumPy type / Python str\u001b[39;00m\n\u001b[0;32m 1563\u001b[0m \u001b[38;5;66;03m# comparison. See https://github.com/numpy/numpy/issues/6784\u001b[39;00m\n\u001b[1;32m-> 1564\u001b[0m present_labels \u001b[38;5;241m=\u001b[39m \u001b[43munique_labels\u001b[49m\u001b[43m(\u001b[49m\u001b[43my_true\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_pred\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mtolist()\n\u001b[0;32m 1565\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m average \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbinary\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 1566\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m y_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbinary\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
+ "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\utils\\multiclass.py:114\u001b[0m, in \u001b[0;36munique_labels\u001b[1;34m(*ys)\u001b[0m\n\u001b[0;32m 112\u001b[0m \u001b[38;5;66;03m# Check that we don't mix string type with number type\u001b[39;00m\n\u001b[0;32m 113\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mset\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(label, \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m label \u001b[38;5;129;01min\u001b[39;00m ys_labels)) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m--> 114\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMix of label input types (string and number)\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m xp\u001b[38;5;241m.\u001b[39masarray(\u001b[38;5;28msorted\u001b[39m(ys_labels))\n",
+ "\u001b[1;31mValueError\u001b[0m: Mix of label input types (string and number)"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "from sklearn import metrics\n",
+ "\n",
+ "for model_name in class_models.keys():\n",
+ " print(f\"Model: {model_name}\")\n",
+ " model = class_models[model_name][\"model\"]\n",
+ "\n",
+ " model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n",
+ " model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())\n",
+ "\n",
+ " y_train_predict = model_pipeline.predict(X_train)\n",
+ " y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]\n",
+ " y_test_predict = np.where(y_test_probs > 0.5, 1, 0)\n",
+ "\n",
+ " class_models[model_name][\"pipeline\"] = model_pipeline\n",
+ " class_models[model_name][\"probs\"] = y_test_probs\n",
+ " class_models[model_name][\"preds\"] = y_test_predict\n",
+ "\n",
+ " class_models[model_name][\"Precision_train\"] = metrics.precision_score(\n",
+ " y_train, y_train_predict, average=\"micro\"\n",
+ " )\n",
+ " class_models[model_name][\"Precision_test\"] = metrics.precision_score(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"Recall_train\"] = metrics.recall_score(\n",
+ " y_train, y_train_predict, average=\"micro\"\n",
+ " )\n",
+ " class_models[model_name][\"Recall_test\"] = metrics.recall_score(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"Accuracy_train\"] = metrics.accuracy_score(\n",
+ " y_train, y_train_predict\n",
+ " )\n",
+ " class_models[model_name][\"Accuracy_test\"] = metrics.accuracy_score(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"ROC_AUC_test\"] = metrics.roc_auc_score(\n",
+ " y_test, y_test_probs\n",
+ " )\n",
+ " class_models[model_name][\"F1_train\"] = metrics.f1_score(y_train, y_train_predict)\n",
+ " class_models[model_name][\"F1_test\"] = metrics.f1_score(y_test, y_test_predict)\n",
+ " class_models[model_name][\"MCC_test\"] = metrics.matthews_corrcoef(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"Confusion_matrix\"] = metrics.confusion_matrix(\n",
+ " y_test, y_test_predict\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Сводная таблица оценок качества для использованных моделей классификации\n",
+ "\n",
+ "Документация: https://scikit-learn.org/1.5/modules/model_evaluation.html"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Матрица неточностей"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.metrics import ConfusionMatrixDisplay\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(12, 10), sharex=False, sharey=False)\n",
+ "for index, key in enumerate(class_models.keys()):\n",
+ " c_matrix = class_models[key][\"Confusion_matrix\"]\n",
+ " disp = ConfusionMatrixDisplay(\n",
+ "    confusion_matrix=c_matrix, display_labels=[\"Died\", \"Survived\"]\n",
+ " ).plot(ax=ax.flat[index])\n",
+ " disp.ax_.set_title(key)\n",
+ "\n",
+ "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Точность, полнота, верность (аккуратность), F-мера"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n",
+ " [\n",
+ " \"Precision_train\",\n",
+ " \"Precision_test\",\n",
+ " \"Recall_train\",\n",
+ " \"Recall_test\",\n",
+ " \"Accuracy_train\",\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_train\",\n",
+ " \"F1_test\",\n",
+ " ]\n",
+ "]\n",
+ "class_metrics.sort_values(\n",
+ " by=\"Accuracy_test\", ascending=False\n",
+ ").style.background_gradient(\n",
+ " cmap=\"plasma\",\n",
+ " low=0.3,\n",
+ " high=1,\n",
+ " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
+ ").background_gradient(\n",
+ " cmap=\"viridis\",\n",
+ " low=1,\n",
+ " high=0.3,\n",
+ " subset=[\n",
+ " \"Precision_train\",\n",
+ " \"Precision_test\",\n",
+ " \"Recall_train\",\n",
+ " \"Recall_test\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n",
+ " [\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_test\",\n",
+ " \"ROC_AUC_test\",\n",
+ " \"Cohen_kappa_test\",\n",
+ " \"MCC_test\",\n",
+ " ]\n",
+ "]\n",
+ "class_metrics.sort_values(by=\"ROC_AUC_test\", ascending=False).style.background_gradient(\n",
+ " cmap=\"plasma\",\n",
+ " low=0.3,\n",
+ " high=1,\n",
+ " subset=[\n",
+ " \"ROC_AUC_test\",\n",
+ " \"MCC_test\",\n",
+ " \"Cohen_kappa_test\",\n",
+ " ],\n",
+ ").background_gradient(\n",
+ " cmap=\"viridis\",\n",
+ " low=1,\n",
+ " high=0.3,\n",
+ " subset=[\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_test\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "best_model = str(class_metrics.sort_values(by=\"MCC_test\", ascending=False).iloc[0].name)\n",
+ "\n",
+ "display(best_model)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Вывод данных с ошибкой предсказания для оценки"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preprocessing_result = pipeline_end.transform(X_test)\n",
+ "preprocessed_df = pd.DataFrame(\n",
+ " preprocessing_result,\n",
+ " columns=pipeline_end.get_feature_names_out(),\n",
+ ")\n",
+ "\n",
+ "y_pred = class_models[best_model][\"preds\"]\n",
+ "\n",
+ "error_index = y_test[y_test[\"Survived\"] != y_pred].index.tolist()\n",
+ "display(f\"Error items count: {len(error_index)}\")\n",
+ "\n",
+ "error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]\n",
+ "error_df = X_test.loc[error_index].copy()\n",
+ "error_df.insert(loc=1, column=\"Predicted\", value=error_predicted)\n",
+ "error_df.sort_index()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Пример использования обученной модели (конвейера) для предсказания"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = class_models[best_model][\"pipeline\"]\n",
+ "\n",
+ "example_id = 450\n",
+ "test = pd.DataFrame(X_test.loc[example_id, :]).T\n",
+ "test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T\n",
+ "display(test)\n",
+ "display(test_preprocessed)\n",
+ "result_proba = model.predict_proba(test)[0]\n",
+ "result = model.predict(test)[0]\n",
+ "real = int(y_test.loc[example_id].values[0])\n",
+ "display(f\"predicted: {result} (proba: {result_proba})\")\n",
+ "display(f\"real: {real}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Подбор гиперпараметров методом поиска по сетке\n",
+ "\n",
+ "https://www.kaggle.com/code/sociopath00/random-forest-using-gridsearchcv\n",
+ "\n",
+ "https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.model_selection import GridSearchCV\n",
+ "\n",
+ "optimized_model_type = \"random_forest\"\n",
+ "\n",
+ "random_forest_model = class_models[optimized_model_type][\"pipeline\"]\n",
+ "\n",
+ "param_grid = {\n",
+ " \"model__n_estimators\": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],\n",
+ " \"model__max_features\": [\"sqrt\", \"log2\", 2],\n",
+ " \"model__max_depth\": [2, 3, 4, 5, 6, 7, 8, 9 ,10],\n",
+ " \"model__criterion\": [\"gini\", \"entropy\", \"log_loss\"],\n",
+ "}\n",
+ "\n",
+ "gs_optomizer = GridSearchCV(\n",
+ " estimator=random_forest_model, param_grid=param_grid, n_jobs=-1\n",
+ ")\n",
+ "gs_optomizer.fit(X_train, y_train.values.ravel())\n",
+ "gs_optomizer.best_params_"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Обучение модели с новыми гиперпараметрами"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "optimized_model = ensemble.RandomForestClassifier(\n",
+ " random_state=random_state,\n",
+ " criterion=\"gini\",\n",
+ " max_depth=7,\n",
+ " max_features=\"sqrt\",\n",
+ " n_estimators=30,\n",
+ ")\n",
+ "\n",
+ "result = {}\n",
+ "\n",
+ "result[\"pipeline\"] = Pipeline([(\"pipeline\", pipeline_end), (\"model\", optimized_model)]).fit(X_train, y_train.values.ravel())\n",
+ "result[\"train_preds\"] = result[\"pipeline\"].predict(X_train)\n",
+ "result[\"probs\"] = result[\"pipeline\"].predict_proba(X_test)[:, 1]\n",
+ "result[\"preds\"] = np.where(result[\"probs\"] > 0.5, 1, 0)\n",
+ "\n",
+ "result[\"Precision_train\"] = metrics.precision_score(y_train, result[\"train_preds\"])\n",
+ "result[\"Precision_test\"] = metrics.precision_score(y_test, result[\"preds\"])\n",
+ "result[\"Recall_train\"] = metrics.recall_score(y_train, result[\"train_preds\"])\n",
+ "result[\"Recall_test\"] = metrics.recall_score(y_test, result[\"preds\"])\n",
+ "result[\"Accuracy_train\"] = metrics.accuracy_score(y_train, result[\"train_preds\"])\n",
+ "result[\"Accuracy_test\"] = metrics.accuracy_score(y_test, result[\"preds\"])\n",
+ "result[\"ROC_AUC_test\"] = metrics.roc_auc_score(y_test, result[\"probs\"])\n",
+ "result[\"F1_train\"] = metrics.f1_score(y_train, result[\"train_preds\"])\n",
+ "result[\"F1_test\"] = metrics.f1_score(y_test, result[\"preds\"])\n",
+ "result[\"MCC_test\"] = metrics.matthews_corrcoef(y_test, result[\"preds\"])\n",
+ "result[\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(y_test, result[\"preds\"])\n",
+ "result[\"Confusion_matrix\"] = metrics.confusion_matrix(y_test, result[\"preds\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Формирование данных для оценки старой и новой версии модели"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 98,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "optimized_metrics = pd.DataFrame(columns=list(result.keys()))\n",
+ "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
+ " data=class_models[optimized_model_type]\n",
+ ")\n",
+ "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
+ " data=result\n",
+ ")\n",
+ "optimized_metrics.insert(loc=0, column=\"Name\", value=[\"Old\", \"New\"])\n",
+ "optimized_metrics = optimized_metrics.set_index(\"Name\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Оценка параметров старой и новой модели"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "optimized_metrics[\n",
+ " [\n",
+ " \"Precision_train\",\n",
+ " \"Precision_test\",\n",
+ " \"Recall_train\",\n",
+ " \"Recall_test\",\n",
+ " \"Accuracy_train\",\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_train\",\n",
+ " \"F1_test\",\n",
+ " ]\n",
+ "].style.background_gradient(\n",
+ " cmap=\"plasma\",\n",
+ " low=0.3,\n",
+ " high=1,\n",
+ " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
+ ").background_gradient(\n",
+ " cmap=\"viridis\",\n",
+ " low=1,\n",
+ " high=0.3,\n",
+ " subset=[\n",
+ " \"Precision_train\",\n",
+ " \"Precision_test\",\n",
+ " \"Recall_train\",\n",
+ " \"Recall_test\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "optimized_metrics[\n",
+ " [\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_test\",\n",
+ " \"ROC_AUC_test\",\n",
+ " \"Cohen_kappa_test\",\n",
+ " \"MCC_test\",\n",
+ " ]\n",
+ "].style.background_gradient(\n",
+ " cmap=\"plasma\",\n",
+ " low=0.3,\n",
+ " high=1,\n",
+ " subset=[\n",
+ " \"ROC_AUC_test\",\n",
+ " \"MCC_test\",\n",
+ " \"Cohen_kappa_test\",\n",
+ " ],\n",
+ ").background_gradient(\n",
+ " cmap=\"viridis\",\n",
+ " low=1,\n",
+ " high=0.3,\n",
+ " subset=[\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_test\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False\n",
+ ")\n",
+ "\n",
+ "for index in range(0, len(optimized_metrics)):\n",
+ " c_matrix = optimized_metrics.iloc[index][\"Confusion_matrix\"]\n",
+ " disp = ConfusionMatrixDisplay(\n",
+ "        confusion_matrix=c_matrix, display_labels=[\"Died\", \"Survived\"]\n",
+ " ).plot(ax=ax.flat[index])\n",
+ "\n",
+ "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)\n",
+ "plt.show()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/transformers.py b/transformers.py
new file mode 100644
index 0000000..4c4342b
--- /dev/null
+++ b/transformers.py
@@ -0,0 +1,27 @@
+import numpy as np
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+
+
class TitanicFeatures(BaseEstimator, TransformerMixin):
    """Derive Titanic-specific features: marital status and cabin deck.

    Adds two columns to the input frame:

    - ``Is_married``: 1 if the passenger's honorific parsed from ``Name``
      is exactly "Mrs", else 0.
    - ``Cabin_type``: first character of ``Cabin`` (the deck letter), or
      the string "unknown" when ``Cabin`` is missing (NaN).

    The transformer is stateless: ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        # Nothing to learn — returned unchanged per the sklearn contract.
        return self

    def transform(self, X, y=None):
        def get_title(name) -> str:
            # Names look like "Last, Title. First ..."; extract "Title".
            return name.split(",")[1].split(".")[0].strip()

        def get_cabin_type(cabin) -> str:
            if pd.isna(cabin):
                return "unknown"
            # First character of the cabin code is the deck letter.
            return cabin[0]

        # Work on a copy: a well-behaved sklearn transformer must not
        # mutate the caller's DataFrame in place (the original code did).
        X = X.copy()
        X["Is_married"] = [1 if get_title(name) == "Mrs" else 0 for name in X["Name"]]
        X["Cabin_type"] = [get_cabin_type(cabin) for cabin in X["Cabin"]]
        return X

    def get_feature_names_out(self, features_in):
        # Output features = input features plus the two engineered columns.
        return np.append(features_in, ["Is_married", "Cabin_type"])
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..cb8c396
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,79 @@
+from typing import Tuple
+
+import pandas as pd
+from pandas import DataFrame
+from sklearn.model_selection import train_test_split
+
+
def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
    """
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs
    this splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val : float
    frac_test : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0.
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test, y_train, y_val, y_test :
        Six dataframes: the three feature splits followed by the three
        corresponding stratification-column splits. When ``frac_val <= 0``
        the val entries are empty dataframes.

    Raises
    ------
    ValueError
        If the fractions do not sum to 1.0 or the stratification column is
        not present in the dataframe.
    """

    # Float fractions rarely sum to exactly 1.0 (e.g. 0.7 + 0.15 + 0.15
    # == 0.9999999999999999), so compare with a tolerance instead of `!=`.
    if abs(frac_train + frac_val + frac_test - 1.0) > 1e-9:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Degenerate case: no validation split requested — temp IS the test set.
    if frac_val <= 0:
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp

    # Split the temp dataframe into val and test dataframes, rescaling the
    # test fraction relative to the size of the temp split.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    # Sanity check: the three splits partition the input exactly.
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test