diff --git a/lec3.ipynb b/lec3.ipynb index 40f4903..9c7cf09 100644 --- a/lec3.ipynb +++ b/lec3.ipynb @@ -18,9 +18,268 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Population 2020Yearly ChangeNet ChangeDensity(P/Km²)Land Area (Km²)Migrants (net)Fert. RateMedAgeUrban Pop %World Share
no
1China14393237760.3955400901539388211-348,3991.73861%18.47%
2India13800043850.99135866314642973190-532,6872.22835%17.70%
3United States3310026510.591937734369147420954,8061.83883%4.25%
4Indonesia2735236151.0728980471511811570-98,9552.33056%3.51%
5Pakistan2208923402.004327022287770880-233,3793.62335%2.83%
....................................
231Montserrat49920.06350100NaNN.A.N.A.10%0.00%
232Falkland Islands34803.05103012170NaNN.A.N.A.66%0.00%
233Niue16260.68116260NaNN.A.N.A.46%0.00%
234Tokelau13571.271713610NaNN.A.N.A.0%0.00%
235Holy See8010.2522,0030NaNN.A.N.A.N.A.0.00%
\n", + "

235 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Population 2020 Yearly Change Net Change \\\n", + "no \n", + "1 China 1439323776 0.39 5540090 \n", + "2 India 1380004385 0.99 13586631 \n", + "3 United States 331002651 0.59 1937734 \n", + "4 Indonesia 273523615 1.07 2898047 \n", + "5 Pakistan 220892340 2.00 4327022 \n", + ".. ... ... ... ... \n", + "231 Montserrat 4992 0.06 3 \n", + "232 Falkland Islands 3480 3.05 103 \n", + "233 Niue 1626 0.68 11 \n", + "234 Tokelau 1357 1.27 17 \n", + "235 Holy See 801 0.25 2 \n", + "\n", + " Density(P/Km²) Land Area (Km²) Migrants (net) Fert. Rate MedAge \\\n", + "no \n", + "1 153 9388211 -348,399 1.7 38 \n", + "2 464 2973190 -532,687 2.2 28 \n", + "3 36 9147420 954,806 1.8 38 \n", + "4 151 1811570 -98,955 2.3 30 \n", + "5 287 770880 -233,379 3.6 23 \n", + ".. ... ... ... ... ... \n", + "231 50 100 NaN N.A. N.A. \n", + "232 0 12170 NaN N.A. N.A. \n", + "233 6 260 NaN N.A. N.A. \n", + "234 136 10 NaN N.A. N.A. \n", + "235 2,003 0 NaN N.A. N.A. \n", + "\n", + " Urban Pop % World Share \n", + "no \n", + "1 61% 18.47% \n", + "2 35% 17.70% \n", + "3 83% 4.25% \n", + "4 56% 3.51% \n", + "5 35% 2.83% \n", + ".. ... ... \n", + "231 10% 0.00% \n", + "232 66% 0.00% \n", + "233 46% 0.00% \n", + "234 0% 0.00% \n", + "235 N.A. 
0.00% \n", + "\n", + "[235 rows x 11 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pandas as pd\n", "\n", @@ -59,7 +318,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -86,7 +345,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -111,7 +370,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -121,9 +380,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(array([ 0. , 5458956.66666667, 10917913.33333333,\n", + " 16376870. ]),\n", + " array([229, 5, 1]))" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "hist1, bins1 = np.histogram(\n", " countries[\"Land Area (Km²)\"].fillna(countries[\"Land Area (Km²)\"].median()), bins=num_bins\n", @@ -133,9 +405,196 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Land Area (Km²)Land Area (Km²)
no
1China9388211(5458956.667, 10917913.333]
2India2973190(0.0, 5458956.667]
3United States9147420(5458956.667, 10917913.333]
4Indonesia1811570(0.0, 5458956.667]
5Pakistan770880(0.0, 5458956.667]
6Brazil8358140(5458956.667, 10917913.333]
7Nigeria910770(0.0, 5458956.667]
8Bangladesh130170(0.0, 5458956.667]
9Russia16376870(10917913.333, 16376870.0]
10Mexico1943950(0.0, 5458956.667]
11Japan364555(0.0, 5458956.667]
12Ethiopia1000000(0.0, 5458956.667]
13Philippines298170(0.0, 5458956.667]
14Egypt995450(0.0, 5458956.667]
15Vietnam310070(0.0, 5458956.667]
16DR Congo2267050(0.0, 5458956.667]
17Turkey769630(0.0, 5458956.667]
18Iran1628550(0.0, 5458956.667]
19Germany348560(0.0, 5458956.667]
20Thailand510890(0.0, 5458956.667]
\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Land Area (Km²) Land Area (Km²)\n", + "no \n", + "1 China 9388211 (5458956.667, 10917913.333]\n", + "2 India 2973190 (0.0, 5458956.667]\n", + "3 United States 9147420 (5458956.667, 10917913.333]\n", + "4 Indonesia 1811570 (0.0, 5458956.667]\n", + "5 Pakistan 770880 (0.0, 5458956.667]\n", + "6 Brazil 8358140 (5458956.667, 10917913.333]\n", + "7 Nigeria 910770 (0.0, 5458956.667]\n", + "8 Bangladesh 130170 (0.0, 5458956.667]\n", + "9 Russia 16376870 (10917913.333, 16376870.0]\n", + "10 Mexico 1943950 (0.0, 5458956.667]\n", + "11 Japan 364555 (0.0, 5458956.667]\n", + "12 Ethiopia 1000000 (0.0, 5458956.667]\n", + "13 Philippines 298170 (0.0, 5458956.667]\n", + "14 Egypt 995450 (0.0, 5458956.667]\n", + "15 Vietnam 310070 (0.0, 5458956.667]\n", + "16 DR Congo 2267050 (0.0, 5458956.667]\n", + "17 Turkey 769630 (0.0, 5458956.667]\n", + "18 Iran 1628550 (0.0, 5458956.667]\n", + "19 Germany 348560 (0.0, 5458956.667]\n", + "20 Thailand 510890 (0.0, 5458956.667]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.concat(\n", " [\n", @@ -149,9 +608,196 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Land Area (Km²)Land Area (Km²)
no
1China9388211Middle
2India2973190Small
3United States9147420Middle
4Indonesia1811570Small
5Pakistan770880Small
6Brazil8358140Middle
7Nigeria910770Small
8Bangladesh130170Small
9Russia16376870Big
10Mexico1943950Small
11Japan364555Small
12Ethiopia1000000Small
13Philippines298170Small
14Egypt995450Small
15Vietnam310070Small
16DR Congo2267050Small
17Turkey769630Small
18Iran1628550Small
19Germany348560Small
20Thailand510890Small
\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Land Area (Km²) Land Area (Km²)\n", + "no \n", + "1 China 9388211 Middle\n", + "2 India 2973190 Small\n", + "3 United States 9147420 Middle\n", + "4 Indonesia 1811570 Small\n", + "5 Pakistan 770880 Small\n", + "6 Brazil 8358140 Middle\n", + "7 Nigeria 910770 Small\n", + "8 Bangladesh 130170 Small\n", + "9 Russia 16376870 Big\n", + "10 Mexico 1943950 Small\n", + "11 Japan 364555 Small\n", + "12 Ethiopia 1000000 Small\n", + "13 Philippines 298170 Small\n", + "14 Egypt 995450 Small\n", + "15 Vietnam 310070 Small\n", + "16 DR Congo 2267050 Small\n", + "17 Turkey 769630 Small\n", + "18 Iran 1628550 Small\n", + "19 Germany 348560 Small\n", + "20 Thailand 510890 Small" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.concat(\n", " [\n", @@ -167,14 +813,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Равномерное разделение данных на 3 группы c установкой собственной границы диапазона значений (от 0 до 100)" + "Равномерное разделение данных на 4 группы c установкой собственной границы диапазона значений (от 0 до 12000000) просто ставим нименьшее и наибольшее и ставим колво групп" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(array([ 0., 4000000., 8000000., 12000000.]),\n", + " array([229, 1, 4, 1]))" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "labels = [\"Small\", \"Middle\", \"Big\"]\n", "bins2 = np.linspace(0, 12000000, 4)\n", @@ -190,9 +848,196 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Land Area (Km²)Land Area (Km²)
no
1China9388211(8000000.0, 12000000.0]
2India2973190(0.0, 4000000.0]
3United States9147420(8000000.0, 12000000.0]
4Indonesia1811570(0.0, 4000000.0]
5Pakistan770880(0.0, 4000000.0]
6Brazil8358140(8000000.0, 12000000.0]
7Nigeria910770(0.0, 4000000.0]
8Bangladesh130170(0.0, 4000000.0]
9Russia16376870NaN
10Mexico1943950(0.0, 4000000.0]
11Japan364555(0.0, 4000000.0]
12Ethiopia1000000(0.0, 4000000.0]
13Philippines298170(0.0, 4000000.0]
14Egypt995450(0.0, 4000000.0]
15Vietnam310070(0.0, 4000000.0]
16DR Congo2267050(0.0, 4000000.0]
17Turkey769630(0.0, 4000000.0]
18Iran1628550(0.0, 4000000.0]
19Germany348560(0.0, 4000000.0]
20Thailand510890(0.0, 4000000.0]
\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Land Area (Km²) Land Area (Km²)\n", + "no \n", + "1 China 9388211 (8000000.0, 12000000.0]\n", + "2 India 2973190 (0.0, 4000000.0]\n", + "3 United States 9147420 (8000000.0, 12000000.0]\n", + "4 Indonesia 1811570 (0.0, 4000000.0]\n", + "5 Pakistan 770880 (0.0, 4000000.0]\n", + "6 Brazil 8358140 (8000000.0, 12000000.0]\n", + "7 Nigeria 910770 (0.0, 4000000.0]\n", + "8 Bangladesh 130170 (0.0, 4000000.0]\n", + "9 Russia 16376870 NaN\n", + "10 Mexico 1943950 (0.0, 4000000.0]\n", + "11 Japan 364555 (0.0, 4000000.0]\n", + "12 Ethiopia 1000000 (0.0, 4000000.0]\n", + "13 Philippines 298170 (0.0, 4000000.0]\n", + "14 Egypt 995450 (0.0, 4000000.0]\n", + "15 Vietnam 310070 (0.0, 4000000.0]\n", + "16 DR Congo 2267050 (0.0, 4000000.0]\n", + "17 Turkey 769630 (0.0, 4000000.0]\n", + "18 Iran 1628550 (0.0, 4000000.0]\n", + "19 Germany 348560 (0.0, 4000000.0]\n", + "20 Thailand 510890 (0.0, 4000000.0]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.concat(\n", " [\n", @@ -206,9 +1051,196 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Land Area (Km²)Land Area (Km²)
no
1China9388211Big
2India2973190Small
3United States9147420Big
4Indonesia1811570Small
5Pakistan770880Small
6Brazil8358140Big
7Nigeria910770Small
8Bangladesh130170Small
9Russia16376870NaN
10Mexico1943950Small
11Japan364555Small
12Ethiopia1000000Small
13Philippines298170Small
14Egypt995450Small
15Vietnam310070Small
16DR Congo2267050Small
17Turkey769630Small
18Iran1628550Small
19Germany348560Small
20Thailand510890Small
\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Land Area (Km²) Land Area (Km²)\n", + "no \n", + "1 China 9388211 Big\n", + "2 India 2973190 Small\n", + "3 United States 9147420 Big\n", + "4 Indonesia 1811570 Small\n", + "5 Pakistan 770880 Small\n", + "6 Brazil 8358140 Big\n", + "7 Nigeria 910770 Small\n", + "8 Bangladesh 130170 Small\n", + "9 Russia 16376870 NaN\n", + "10 Mexico 1943950 Small\n", + "11 Japan 364555 Small\n", + "12 Ethiopia 1000000 Small\n", + "13 Philippines 298170 Small\n", + "14 Egypt 995450 Small\n", + "15 Vietnam 310070 Small\n", + "16 DR Congo 2267050 Small\n", + "17 Turkey 769630 Small\n", + "18 Iran 1628550 Small\n", + "19 Germany 348560 Small\n", + "20 Thailand 510890 Small" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.concat(\n", " [\n", @@ -224,14 +1256,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Равномерное разделение данных на 3 группы c установкой собственных интервалов (0 - 39, 40 - 60, 61 - 100)" + "Равномерное разделение данных на 5 групп c установкой собственных интервалов (0 - 1000, 1000 - 100000, 100000 - 500000, 500000 - 3000000, 3000000 И БОЛЕЕ)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(array([0.e+00, 1.e+03, 1.e+05, 5.e+05, 3.e+06, inf]),\n", + " array([52, 77, 56, 44, 6]))" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "labels2 = [\"Dwarf\", \"Small\", \"Middle\", \"Big\", \"Giant\"]\n", "hist3, bins3 = np.histogram(\n", @@ -245,9 +1289,196 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Land Area (Km²)Land Area (Km²)
no
1China9388211(3000000.0, inf]
2India2973190(500000.0, 3000000.0]
3United States9147420(3000000.0, inf]
4Indonesia1811570(500000.0, 3000000.0]
5Pakistan770880(500000.0, 3000000.0]
6Brazil8358140(3000000.0, inf]
7Nigeria910770(500000.0, 3000000.0]
8Bangladesh130170(100000.0, 500000.0]
9Russia16376870(3000000.0, inf]
10Mexico1943950(500000.0, 3000000.0]
11Japan364555(100000.0, 500000.0]
12Ethiopia1000000(500000.0, 3000000.0]
13Philippines298170(100000.0, 500000.0]
14Egypt995450(500000.0, 3000000.0]
15Vietnam310070(100000.0, 500000.0]
16DR Congo2267050(500000.0, 3000000.0]
17Turkey769630(500000.0, 3000000.0]
18Iran1628550(500000.0, 3000000.0]
19Germany348560(100000.0, 500000.0]
20Thailand510890(500000.0, 3000000.0]
\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Land Area (Km²) Land Area (Km²)\n", + "no \n", + "1 China 9388211 (3000000.0, inf]\n", + "2 India 2973190 (500000.0, 3000000.0]\n", + "3 United States 9147420 (3000000.0, inf]\n", + "4 Indonesia 1811570 (500000.0, 3000000.0]\n", + "5 Pakistan 770880 (500000.0, 3000000.0]\n", + "6 Brazil 8358140 (3000000.0, inf]\n", + "7 Nigeria 910770 (500000.0, 3000000.0]\n", + "8 Bangladesh 130170 (100000.0, 500000.0]\n", + "9 Russia 16376870 (3000000.0, inf]\n", + "10 Mexico 1943950 (500000.0, 3000000.0]\n", + "11 Japan 364555 (100000.0, 500000.0]\n", + "12 Ethiopia 1000000 (500000.0, 3000000.0]\n", + "13 Philippines 298170 (100000.0, 500000.0]\n", + "14 Egypt 995450 (500000.0, 3000000.0]\n", + "15 Vietnam 310070 (100000.0, 500000.0]\n", + "16 DR Congo 2267050 (500000.0, 3000000.0]\n", + "17 Turkey 769630 (500000.0, 3000000.0]\n", + "18 Iran 1628550 (500000.0, 3000000.0]\n", + "19 Germany 348560 (100000.0, 500000.0]\n", + "20 Thailand 510890 (500000.0, 3000000.0]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.concat(\n", " [\n", @@ -261,9 +1492,196 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Land Area (Km²)Land Area (Km²)
no
1China9388211Giant
2India2973190Big
3United States9147420Giant
4Indonesia1811570Big
5Pakistan770880Big
6Brazil8358140Giant
7Nigeria910770Big
8Bangladesh130170Middle
9Russia16376870Giant
10Mexico1943950Big
11Japan364555Middle
12Ethiopia1000000Big
13Philippines298170Middle
14Egypt995450Big
15Vietnam310070Middle
16DR Congo2267050Big
17Turkey769630Big
18Iran1628550Big
19Germany348560Middle
20Thailand510890Big
\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Land Area (Km²) Land Area (Km²)\n", + "no \n", + "1 China 9388211 Giant\n", + "2 India 2973190 Big\n", + "3 United States 9147420 Giant\n", + "4 Indonesia 1811570 Big\n", + "5 Pakistan 770880 Big\n", + "6 Brazil 8358140 Giant\n", + "7 Nigeria 910770 Big\n", + "8 Bangladesh 130170 Middle\n", + "9 Russia 16376870 Giant\n", + "10 Mexico 1943950 Big\n", + "11 Japan 364555 Middle\n", + "12 Ethiopia 1000000 Big\n", + "13 Philippines 298170 Middle\n", + "14 Egypt 995450 Big\n", + "15 Vietnam 310070 Middle\n", + "16 DR Congo 2267050 Big\n", + "17 Turkey 769630 Big\n", + "18 Iran 1628550 Big\n", + "19 Germany 348560 Middle\n", + "20 Thailand 510890 Big" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.concat(\n", " [\n", @@ -284,9 +1702,196 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Land Area (Km²)Land Area (Km²)
no
1China93882114
2India29731904
3United States91474204
4Indonesia18115704
5Pakistan7708804
6Brazil83581404
7Nigeria9107704
8Bangladesh1301702
9Russia163768704
10Mexico19439504
11Japan3645553
12Ethiopia10000004
13Philippines2981703
14Egypt9954504
15Vietnam3100703
16DR Congo22670504
17Turkey7696304
18Iran16285504
19Germany3485603
20Thailand5108903
\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Land Area (Km²) Land Area (Km²)\n", + "no \n", + "1 China 9388211 4\n", + "2 India 2973190 4\n", + "3 United States 9147420 4\n", + "4 Indonesia 1811570 4\n", + "5 Pakistan 770880 4\n", + "6 Brazil 8358140 4\n", + "7 Nigeria 910770 4\n", + "8 Bangladesh 130170 2\n", + "9 Russia 16376870 4\n", + "10 Mexico 1943950 4\n", + "11 Japan 364555 3\n", + "12 Ethiopia 1000000 4\n", + "13 Philippines 298170 3\n", + "14 Egypt 995450 4\n", + "15 Vietnam 310070 3\n", + "16 DR Congo 2267050 4\n", + "17 Turkey 769630 4\n", + "18 Iran 1628550 4\n", + "19 Germany 348560 3\n", + "20 Thailand 510890 3" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.concat(\n", " [\n", @@ -300,9 +1905,196 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Country (or dependency)Land Area (Km²)Land Area (Km²)
no
1China9388211Giant
2India2973190Giant
3United States9147420Giant
4Indonesia1811570Giant
5Pakistan770880Giant
6Brazil8358140Giant
7Nigeria910770Giant
8Bangladesh130170Middle
9Russia16376870Giant
10Mexico1943950Giant
11Japan364555Big
12Ethiopia1000000Giant
13Philippines298170Big
14Egypt995450Giant
15Vietnam310070Big
16DR Congo2267050Giant
17Turkey769630Giant
18Iran1628550Giant
19Germany348560Big
20Thailand510890Big
\n", + "
" + ], + "text/plain": [ + " Country (or dependency) Land Area (Km²) Land Area (Km²)\n", + "no \n", + "1 China 9388211 Giant\n", + "2 India 2973190 Giant\n", + "3 United States 9147420 Giant\n", + "4 Indonesia 1811570 Giant\n", + "5 Pakistan 770880 Giant\n", + "6 Brazil 8358140 Giant\n", + "7 Nigeria 910770 Giant\n", + "8 Bangladesh 130170 Middle\n", + "9 Russia 16376870 Giant\n", + "10 Mexico 1943950 Giant\n", + "11 Japan 364555 Big\n", + "12 Ethiopia 1000000 Giant\n", + "13 Philippines 298170 Big\n", + "14 Egypt 995450 Giant\n", + "15 Vietnam 310070 Big\n", + "16 DR Congo 2267050 Giant\n", + "17 Turkey 769630 Giant\n", + "18 Iran 1628550 Giant\n", + "19 Germany 348560 Big\n", + "20 Thailand 510890 Big" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.concat(\n", " [\n", @@ -329,7 +2121,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -364,16 +2156,12 @@ "source": [ "#### Загрузка данных\n", "\n", - "За основу был взят набор данных \"Ecommerce Orders Data Set\" из Kaggle\n", - "\n", - "Используется только 100 первых заказов и связанные с ними объекты\n", - "\n", - "https://www.kaggle.com/datasets/sangamsharmait/ecommerce-orders-data-analysis" + "приведение даннык к нормальному виду\n" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -430,7 +2218,7 @@ " [234 rows x 3 columns])" ] }, - "execution_count": 32, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -481,7 +2269,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -506,7 +2294,7 @@ " No relationships" ] }, - "execution_count": 34, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -558,7 +2346,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 20, "metadata": {}, 
"outputs": [ { @@ -573,7 +2361,7 @@ " countries.Country (or dependency) -> capitals.Country/Territory" ] }, - "execution_count": 35, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -599,7 +2387,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -790,7 +2578,7 @@ "[235 rows x 7 columns]" ] }, - "execution_count": 36, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -816,7 +2604,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -831,7 +2619,7 @@ " ]" ] }, - "execution_count": 37, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -856,7 +2644,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -865,7 +2653,7 @@ "" ] }, - "execution_count": 38, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" }, @@ -893,7 +2681,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1141,16 +2929,16 @@ "29 Colombia 50882891 50000000" ] }, - "execution_count": 40, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries_norm = countries.copy()\n", - "\n", + "#заменяем все значения больше 50000000 на 50000000\n", "countries_norm[\"Population Clip\"] = countries_norm[\"Population 2020\"].clip(0, 50000000);\n", - "\n", + "#проверка результата\n", "countries_norm[countries_norm[\"Population 2020\"] > 50000000][\n", " [\"Country (or dependency)\", \"Population 2020\", \"Population Clip\"]\n", "]" @@ -1160,12 +2948,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Винсоризация признака Возраст" + "Винсоризация \n", + " * `winsorize()`: Функция для обработки выбросов с помощью Winsorization. 
\n", + " * `countries_norm[\"Population 2020\"].fillna(countries_norm[\"Population 2020\"].mean())`: Заменяет пропущенные значения в столбце \"Population 2020\" средним значением этого столбца.\n", + " * `(0, 0.05)`: Указывает, что нужно обработать как нижние, так и верхние выбросы. 0.05 означает, что 5% самых маленьких и 5% самых больших значений в столбце \"Population 2020\" будут заменены на значения 5-го и 95-го процентилей соответственно. \n", + " * `inplace=False`: Указывает, что `winsorize` не должен модифицировать исходный датафрейм `countries_norm` напрямую, а создать новый столбец с обработанными данными." ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -1420,7 +3212,7 @@ "29 Colombia 50882891 50882891" ] }, - "execution_count": 41, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1450,7 +3242,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -1628,7 +3420,7 @@ "[235 rows x 6 columns]" ] }, - "execution_count": 43, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1677,7 +3469,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1842,7 +3634,7 @@ "[235 rows x 5 columns]" ] }, - "execution_count": 44, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } diff --git a/lec4.ipynb b/lec4.ipynb new file mode 100644 index 0000000..1eeba17 --- /dev/null +++ b/lec4.ipynb @@ -0,0 +1,2524 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Загрузка набора данных" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PriceLevyManufacturerModelProd_yearCategoryLeather_interiorFuel typeEngine volumeMileageCylindersGear box typeDrive wheelsDoorsWheelColorAirbags
ID
45654403133281399LEXUSRX 4502010JeepYesHybrid3.5186005 km6.0Automatic4x404-MayLeft wheelSilver12
44731507166211018CHEVROLETEquinox2011JeepNoPetrol3192000 km6.0Tiptronic4x404-MayLeft wheelBlack8
457744198467-HONDAFIT2006HatchbackNoPetrol1.3200000 km4.0VariatorFront04-MayRight-hand driveBlack2
457691853607862FORDEscape2011JeepYesHybrid2.5168966 km4.0Automatic4x404-MayLeft wheelWhite0
4580926311726446HONDAFIT2014HatchbackYesPetrol1.391901 km4.0AutomaticFront04-MayLeft wheelSilver4
......................................................
457983558467-MERCEDES-BENZCLK 2001999CoupeYesCNG2.0 Turbo300000 km4.0ManualRear02-MarLeft wheelSilver5
4577885615681831HYUNDAISonata2011SedanYesPetrol2.4161600 km4.0TiptronicFront04-MayLeft wheelRed8
4580499726108836HYUNDAITucson2010JeepYesDiesel2116365 km4.0AutomaticFront04-MayLeft wheelGrey4
4579352653311288CHEVROLETCaptiva2007JeepYesDiesel251258 km4.0AutomaticFront04-MayLeft wheelBlack4
45813273470753HYUNDAISonata2012SedanYesHybrid2.4186923 km4.0AutomaticFront04-MayLeft wheelWhite12
\n", + "

19237 rows × 17 columns

\n", + "
" + ], + "text/plain": [ + " Price Levy Manufacturer Model Prod_year Category \\\n", + "ID \n", + "45654403 13328 1399 LEXUS RX 450 2010 Jeep \n", + "44731507 16621 1018 CHEVROLET Equinox 2011 Jeep \n", + "45774419 8467 - HONDA FIT 2006 Hatchback \n", + "45769185 3607 862 FORD Escape 2011 Jeep \n", + "45809263 11726 446 HONDA FIT 2014 Hatchback \n", + "... ... ... ... ... ... ... \n", + "45798355 8467 - MERCEDES-BENZ CLK 200 1999 Coupe \n", + "45778856 15681 831 HYUNDAI Sonata 2011 Sedan \n", + "45804997 26108 836 HYUNDAI Tucson 2010 Jeep \n", + "45793526 5331 1288 CHEVROLET Captiva 2007 Jeep \n", + "45813273 470 753 HYUNDAI Sonata 2012 Sedan \n", + "\n", + " Leather_interior Fuel type Engine volume Mileage Cylinders \\\n", + "ID \n", + "45654403 Yes Hybrid 3.5 186005 km 6.0 \n", + "44731507 No Petrol 3 192000 km 6.0 \n", + "45774419 No Petrol 1.3 200000 km 4.0 \n", + "45769185 Yes Hybrid 2.5 168966 km 4.0 \n", + "45809263 Yes Petrol 1.3 91901 km 4.0 \n", + "... ... ... ... ... ... \n", + "45798355 Yes CNG 2.0 Turbo 300000 km 4.0 \n", + "45778856 Yes Petrol 2.4 161600 km 4.0 \n", + "45804997 Yes Diesel 2 116365 km 4.0 \n", + "45793526 Yes Diesel 2 51258 km 4.0 \n", + "45813273 Yes Hybrid 2.4 186923 km 4.0 \n", + "\n", + " Gear box type Drive wheels Doors Wheel Color Airbags \n", + "ID \n", + "45654403 Automatic 4x4 04-May Left wheel Silver 12 \n", + "44731507 Tiptronic 4x4 04-May Left wheel Black 8 \n", + "45774419 Variator Front 04-May Right-hand drive Black 2 \n", + "45769185 Automatic 4x4 04-May Left wheel White 0 \n", + "45809263 Automatic Front 04-May Left wheel Silver 4 \n", + "... ... ... ... ... ... ... 
\n", + "45798355 Manual Rear 02-Mar Left wheel Silver 5 \n", + "45778856 Tiptronic Front 04-May Left wheel Red 8 \n", + "45804997 Automatic Front 04-May Left wheel Grey 4 \n", + "45793526 Automatic Front 04-May Left wheel Black 4 \n", + "45813273 Automatic Front 04-May Left wheel White 12 \n", + "\n", + "[19237 rows x 17 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "from sklearn import set_config\n", + "\n", + "set_config(transform_output=\"pandas\")\n", + "\n", + "random_state=9\n", + "\n", + "df = pd.read_csv(\"data/car_price_prediction.csv\", index_col=\"ID\")\n", + "\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Разделение набора данных на обучающую и тестовые выборки (80/20) для задачи классификации\n", + "\n", + "Целевой признак -- gear box type - коробка переключения передач. x - полная выборка, y - gear box столбец\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'X_train'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PriceLevyManufacturerModelProd_yearCategoryLeather_interiorFuel typeEngine volumeMileageCylindersGear box typeDrive wheelsDoorsWheelColorAirbags
ID
457581531333289FORDEscape2008JeepYesHybrid0.4349288 km4.0AutomaticFront04-MayLeft wheelBlue0
4569993017249-FORDEscape Hybrid2008JeepNoHybrid2.3147000 km4.0Variator4x404-MayLeft wheelWhite8
4564656213331053LEXUSES 3502014SedanYesPetrol3.5179358 km6.0AutomaticFront04-MayLeft wheelRed12
4565692398791018MERCEDES-BENZML 3502011JeepYesDiesel3275862 km6.0Automatic4x404-MayLeft wheelSilver12
45815887109761275HYUNDAISonata2019SedanYesPetrol2.429419 km4.0AutomaticFront04-MayLeft wheelBlue12
......................................................
45802363218051024HYUNDAIH12010MinivanYesDiesel2.558958 km4.0AutomaticFront04-MayLeft wheelBlack4
458127772201327TOYOTACamry2018SedanYesPetrol2.547688 km4.0AutomaticFront04-MayLeft wheelBlue12
4410441715210-TOYOTAAqua2014HatchbackNoHybrid1.5139000 km4.0VariatorFront04-MayRight-hand driveWhite2
457934063136-OPELCorsa1995HatchbackNoPetrol1.4100000 km4.0ManualFront02-MarLeft wheelGrey2
4570070018817-TOYOTACamry2007SedanYesHybrid2.4151000 km4.0VariatorFront04-MayLeft wheelBlack10
\n", + "

15389 rows × 17 columns

\n", + "
" + ], + "text/plain": [ + " Price Levy Manufacturer Model Prod_year Category \\\n", + "ID \n", + "45758153 1333 289 FORD Escape 2008 Jeep \n", + "45699930 17249 - FORD Escape Hybrid 2008 Jeep \n", + "45646562 1333 1053 LEXUS ES 350 2014 Sedan \n", + "45656923 9879 1018 MERCEDES-BENZ ML 350 2011 Jeep \n", + "45815887 10976 1275 HYUNDAI Sonata 2019 Sedan \n", + "... ... ... ... ... ... ... \n", + "45802363 21805 1024 HYUNDAI H1 2010 Minivan \n", + "45812777 220 1327 TOYOTA Camry 2018 Sedan \n", + "44104417 15210 - TOYOTA Aqua 2014 Hatchback \n", + "45793406 3136 - OPEL Corsa 1995 Hatchback \n", + "45700700 18817 - TOYOTA Camry 2007 Sedan \n", + "\n", + " Leather_interior Fuel type Engine volume Mileage Cylinders \\\n", + "ID \n", + "45758153 Yes Hybrid 0.4 349288 km 4.0 \n", + "45699930 No Hybrid 2.3 147000 km 4.0 \n", + "45646562 Yes Petrol 3.5 179358 km 6.0 \n", + "45656923 Yes Diesel 3 275862 km 6.0 \n", + "45815887 Yes Petrol 2.4 29419 km 4.0 \n", + "... ... ... ... ... ... \n", + "45802363 Yes Diesel 2.5 58958 km 4.0 \n", + "45812777 Yes Petrol 2.5 47688 km 4.0 \n", + "44104417 No Hybrid 1.5 139000 km 4.0 \n", + "45793406 No Petrol 1.4 100000 km 4.0 \n", + "45700700 Yes Hybrid 2.4 151000 km 4.0 \n", + "\n", + " Gear box type Drive wheels Doors Wheel Color Airbags \n", + "ID \n", + "45758153 Automatic Front 04-May Left wheel Blue 0 \n", + "45699930 Variator 4x4 04-May Left wheel White 8 \n", + "45646562 Automatic Front 04-May Left wheel Red 12 \n", + "45656923 Automatic 4x4 04-May Left wheel Silver 12 \n", + "45815887 Automatic Front 04-May Left wheel Blue 12 \n", + "... ... ... ... ... ... ... 
\n", + "45802363 Automatic Front 04-May Left wheel Black 4 \n", + "45812777 Automatic Front 04-May Left wheel Blue 12 \n", + "44104417 Variator Front 04-May Right-hand drive White 2 \n", + "45793406 Manual Front 02-Mar Left wheel Grey 2 \n", + "45700700 Variator Front 04-May Left wheel Black 10 \n", + "\n", + "[15389 rows x 17 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_train'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Gear box type
ID
45758153Automatic
45699930Variator
45646562Automatic
45656923Automatic
45815887Automatic
......
45802363Automatic
45812777Automatic
44104417Variator
45793406Manual
45700700Variator
\n", + "

15389 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " Gear box type\n", + "ID \n", + "45758153 Automatic\n", + "45699930 Variator\n", + "45646562 Automatic\n", + "45656923 Automatic\n", + "45815887 Automatic\n", + "... ...\n", + "45802363 Automatic\n", + "45812777 Automatic\n", + "44104417 Variator\n", + "45793406 Manual\n", + "45700700 Variator\n", + "\n", + "[15389 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'X_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PriceLevyManufacturerModelProd_yearCategoryLeather_interiorFuel typeEngine volumeMileageCylindersGear box typeDrive wheelsDoorsWheelColorAirbags
ID
45813151220919MERCEDES-BENZML 3502012JeepYesDiesel3209072 km6.0Automatic4x404-MayLeft wheelGrey12
4578374411000-JEEPLiberty2001JeepYesLPG3.7137582 km6.0Automatic4x404-MayRight-hand driveSilver6
4580585010976-TOYOTARAV 42002JeepYesCNG2200000 km4.0Automatic4x404-MayLeft wheelWhite4
458164091568753HYUNDAISonata2012SedanYesPetrol2.4246230 km4.0AutomaticFront04-MayLeft wheelBlack12
452812428938843TOYOTAPrius2008SedanNoHybrid1.5133016 km4.0AutomaticFront04-MayLeft wheelBeige8
......................................................
4579847813172639FORDFocus2014SedanYesPetrol2134400 km4.0TiptronicFront04-MayLeft wheelRed8
4532190916621-TOYOTAPrius2010HatchbackNoHybrid1.8154000 km4.0VariatorFront04-MayLeft wheelWhite6
45758118156811811LEXUSGX 4602010JeepYesPetrol4.6275240 km8.0Automatic4x404-MayLeft wheelSilver0
457581376476-NISSANNote2008HatchbackNoCNG1.5999999999 km4.0Automatic4x404-MayRight-hand driveBlack0
457204113697VOLKSWAGENJetta2015SedanYesPetrol1.8 Turbo65000 km4.0AutomaticFront04-MayLeft wheelGrey12
\n", + "

3848 rows × 17 columns

\n", + "
" + ], + "text/plain": [ + " Price Levy Manufacturer Model Prod_year Category \\\n", + "ID \n", + "45813151 220 919 MERCEDES-BENZ ML 350 2012 Jeep \n", + "45783744 11000 - JEEP Liberty 2001 Jeep \n", + "45805850 10976 - TOYOTA RAV 4 2002 Jeep \n", + "45816409 1568 753 HYUNDAI Sonata 2012 Sedan \n", + "45281242 8938 843 TOYOTA Prius 2008 Sedan \n", + "... ... ... ... ... ... ... \n", + "45798478 13172 639 FORD Focus 2014 Sedan \n", + "45321909 16621 - TOYOTA Prius 2010 Hatchback \n", + "45758118 15681 1811 LEXUS GX 460 2010 Jeep \n", + "45758137 6476 - NISSAN Note 2008 Hatchback \n", + "45720411 3 697 VOLKSWAGEN Jetta 2015 Sedan \n", + "\n", + " Leather_interior Fuel type Engine volume Mileage Cylinders \\\n", + "ID \n", + "45813151 Yes Diesel 3 209072 km 6.0 \n", + "45783744 Yes LPG 3.7 137582 km 6.0 \n", + "45805850 Yes CNG 2 200000 km 4.0 \n", + "45816409 Yes Petrol 2.4 246230 km 4.0 \n", + "45281242 No Hybrid 1.5 133016 km 4.0 \n", + "... ... ... ... ... ... \n", + "45798478 Yes Petrol 2 134400 km 4.0 \n", + "45321909 No Hybrid 1.8 154000 km 4.0 \n", + "45758118 Yes Petrol 4.6 275240 km 8.0 \n", + "45758137 No CNG 1.5 999999999 km 4.0 \n", + "45720411 Yes Petrol 1.8 Turbo 65000 km 4.0 \n", + "\n", + " Gear box type Drive wheels Doors Wheel Color Airbags \n", + "ID \n", + "45813151 Automatic 4x4 04-May Left wheel Grey 12 \n", + "45783744 Automatic 4x4 04-May Right-hand drive Silver 6 \n", + "45805850 Automatic 4x4 04-May Left wheel White 4 \n", + "45816409 Automatic Front 04-May Left wheel Black 12 \n", + "45281242 Automatic Front 04-May Left wheel Beige 8 \n", + "... ... ... ... ... ... ... 
\n", + "45798478 Tiptronic Front 04-May Left wheel Red 8 \n", + "45321909 Variator Front 04-May Left wheel White 6 \n", + "45758118 Automatic 4x4 04-May Left wheel Silver 0 \n", + "45758137 Automatic 4x4 04-May Right-hand drive Black 0 \n", + "45720411 Automatic Front 04-May Left wheel Grey 12 \n", + "\n", + "[3848 rows x 17 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'y_test'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Gear box type
ID
45813151Automatic
45783744Automatic
45805850Automatic
45816409Automatic
45281242Automatic
......
45798478Tiptronic
45321909Variator
45758118Automatic
45758137Automatic
45720411Automatic
\n", + "

3848 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " Gear box type\n", + "ID \n", + "45813151 Automatic\n", + "45783744 Automatic\n", + "45805850 Automatic\n", + "45816409 Automatic\n", + "45281242 Automatic\n", + "... ...\n", + "45798478 Tiptronic\n", + "45321909 Variator\n", + "45758118 Automatic\n", + "45758137 Automatic\n", + "45720411 Automatic\n", + "\n", + "[3848 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from utils import split_stratified_into_train_val_test\n", + "\n", + "X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n", + " df, stratify_colname=\"Gear box type\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=random_state\n", + ")\n", + "\n", + "display(\"X_train\", X_train)\n", + "display(\"y_train\", y_train)\n", + "\n", + "display(\"X_test\", X_test)\n", + "display(\"y_test\", y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В итоге, этот код выполняет следующие действия:\n", + "\n", + "* Заполняет пропущенные значения: В числовых столбцах медианой, в категориальных - значением \"unknown\".\n", + "* Стандартизирует числовые данные: приводит их к нулевому среднему и единичному стандартному отклонению.\n", + "* Преобразует категориальные данные: использует one-hot-кодирование.\n", + "* Удаляет ненужные столбцы: из списка `columns_to_drop`.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Формирование конвейера для классификации данных\n", + "\n", + "preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n", + "\n", + "preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n", + "\n", + "features_preprocessing -- трансформер для предобработки признаков\n", + "\n", + "features_engineering -- трансформер для конструирования признаков\n", + "\n", + "drop_columns -- трансформер для 
удаления колонок\n", + "\n", + "features_postprocessing -- трансформер для унитарного кодирования новых признаков\n", + "\n", + "pipeline_end -- основной конвейер предобработки данных и конструирования признаков\n", + "\n", + "Конвейер выполняется последовательно.\n", + "\n", + "Трансформер выполняет параллельно для указанного набора колонок.\n", + "\n", + "Документация: \n", + "\n", + "https://scikit-learn.org/1.5/api/sklearn.pipeline.html\n", + "\n", + "https://scikit-learn.org/1.5/modules/generated/sklearn.compose.ColumnTransformer.html#sklearn.compose.ColumnTransformer" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.discriminant_analysis import StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "\n", + "from transformers import TitanicFeatures\n", + "\n", + "\n", + "#columns_to_drop = [\"Survived\", \"Name\", \"Cabin\", \"Ticket\", \"Embarked\", \"Parch\", \"Fare\"]\n", + "columns_to_drop = [\"Doors\", \"Color\", \"Gear box type\", \"Prod_year\", \"Mileage\", \"Airbags\", \"Levy\", \"Leather_interior\", \"Fuel type\", \"Drive wheels\"]\n", + "num_columns = [\n", + " column\n", + " for column in df.columns\n", + " if column not in columns_to_drop and df[column].dtype != \"object\"\n", + "]\n", + "cat_columns = [\n", + " column\n", + " for column in df.columns\n", + " if column not in columns_to_drop and df[column].dtype == \"object\"\n", + "]\n", + "\n", + "num_imputer = SimpleImputer(strategy=\"median\")\n", + "num_scaler = StandardScaler()\n", + "preprocessing_num = Pipeline(\n", + " [\n", + " (\"imputer\", num_imputer),\n", + " (\"scaler\", num_scaler),\n", + " ]\n", + ")\n", + "\n", + "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n", + "cat_encoder = 
OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n", + "preprocessing_cat = Pipeline(\n", + " [\n", + " (\"imputer\", cat_imputer),\n", + " (\"encoder\", cat_encoder),\n", + " ]\n", + ")\n", + "\n", + "features_preprocessing = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"prepocessing_num\", preprocessing_num, num_columns),\n", + " (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n", + " #(\"prepocessing_features\", cat_imputer, [\"Name\", \"Cabin\"]),\n", + " ],\n", + " remainder=\"passthrough\"\n", + ")\n", + "\n", + "# features_engineering = ColumnTransformer(\n", + "# verbose_feature_names_out=False,\n", + "# transformers=[\n", + "# (\"add_features\", TitanicFeatures(), [\"Name\", \"Cabin\"]),\n", + "# ],\n", + "# remainder=\"passthrough\",\n", + "# )\n", + "\n", + "drop_columns = ColumnTransformer(\n", + " verbose_feature_names_out=False,\n", + " transformers=[\n", + " (\"drop_columns\", \"drop\", columns_to_drop),\n", + " ],\n", + " remainder=\"passthrough\",\n", + ")\n", + "\n", + "# features_postprocessing = ColumnTransformer(\n", + "# verbose_feature_names_out=False,\n", + "# transformers=[\n", + "# (\"prepocessing_cat\", preprocessing_cat, [\"Cabin_type\"]),\n", + "# ],\n", + "# remainder=\"passthrough\",\n", + "# )\n", + "\n", + "pipeline_end = Pipeline(\n", + " [\n", + " (\"features_preprocessing\", features_preprocessing),\n", + " # (\"features_engineering\", features_engineering),\n", + " (\"drop_columns\", drop_columns),\n", + " # (\"features_postprocessing\", features_postprocessing),\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Демонстрация работы конвейера для предобработки данных при классификации" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PriceCylindersManufacturer_ALFA ROMEOManufacturer_ASTON MARTINManufacturer_AUDIManufacturer_BENTLEYManufacturer_BMWManufacturer_BUICKManufacturer_CADILLACManufacturer_CHEVROLET...Engine volume_5.7 TurboEngine volume_5.8Engine volume_5.9Engine volume_6Engine volume_6.2Engine volume_6.3Engine volume_6.3 TurboEngine volume_6.7Engine volume_6.8Wheel_Right-hand drive
ID
45758153-0.082497-0.4850380.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
45699930-0.007675-0.4850380.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
45646562-0.0824971.1870620.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
45656923-0.0423221.1870620.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
45815887-0.037165-0.4850380.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
..................................................................
458023630.013743-0.4850380.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
45812777-0.087729-0.4850380.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
44104417-0.017260-0.4850380.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.01.0
45793406-0.074021-0.4850380.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
45700700-0.000304-0.4850380.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", + "

15389 rows × 1573 columns

\n", + "
" + ], + "text/plain": [ + " Price Cylinders Manufacturer_ALFA ROMEO \\\n", + "ID \n", + "45758153 -0.082497 -0.485038 0.0 \n", + "45699930 -0.007675 -0.485038 0.0 \n", + "45646562 -0.082497 1.187062 0.0 \n", + "45656923 -0.042322 1.187062 0.0 \n", + "45815887 -0.037165 -0.485038 0.0 \n", + "... ... ... ... \n", + "45802363 0.013743 -0.485038 0.0 \n", + "45812777 -0.087729 -0.485038 0.0 \n", + "44104417 -0.017260 -0.485038 0.0 \n", + "45793406 -0.074021 -0.485038 0.0 \n", + "45700700 -0.000304 -0.485038 0.0 \n", + "\n", + " Manufacturer_ASTON MARTIN Manufacturer_AUDI Manufacturer_BENTLEY \\\n", + "ID \n", + "45758153 0.0 0.0 0.0 \n", + "45699930 0.0 0.0 0.0 \n", + "45646562 0.0 0.0 0.0 \n", + "45656923 0.0 0.0 0.0 \n", + "45815887 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "45802363 0.0 0.0 0.0 \n", + "45812777 0.0 0.0 0.0 \n", + "44104417 0.0 0.0 0.0 \n", + "45793406 0.0 0.0 0.0 \n", + "45700700 0.0 0.0 0.0 \n", + "\n", + " Manufacturer_BMW Manufacturer_BUICK Manufacturer_CADILLAC \\\n", + "ID \n", + "45758153 0.0 0.0 0.0 \n", + "45699930 0.0 0.0 0.0 \n", + "45646562 0.0 0.0 0.0 \n", + "45656923 0.0 0.0 0.0 \n", + "45815887 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "45802363 0.0 0.0 0.0 \n", + "45812777 0.0 0.0 0.0 \n", + "44104417 0.0 0.0 0.0 \n", + "45793406 0.0 0.0 0.0 \n", + "45700700 0.0 0.0 0.0 \n", + "\n", + " Manufacturer_CHEVROLET ... Engine volume_5.7 Turbo \\\n", + "ID ... \n", + "45758153 0.0 ... 0.0 \n", + "45699930 0.0 ... 0.0 \n", + "45646562 0.0 ... 0.0 \n", + "45656923 0.0 ... 0.0 \n", + "45815887 0.0 ... 0.0 \n", + "... ... ... ... \n", + "45802363 0.0 ... 0.0 \n", + "45812777 0.0 ... 0.0 \n", + "44104417 0.0 ... 0.0 \n", + "45793406 0.0 ... 0.0 \n", + "45700700 0.0 ... 0.0 \n", + "\n", + " Engine volume_5.8 Engine volume_5.9 Engine volume_6 \\\n", + "ID \n", + "45758153 0.0 0.0 0.0 \n", + "45699930 0.0 0.0 0.0 \n", + "45646562 0.0 0.0 0.0 \n", + "45656923 0.0 0.0 0.0 \n", + "45815887 0.0 0.0 0.0 \n", + "... ... ... ... 
\n", + "45802363 0.0 0.0 0.0 \n", + "45812777 0.0 0.0 0.0 \n", + "44104417 0.0 0.0 0.0 \n", + "45793406 0.0 0.0 0.0 \n", + "45700700 0.0 0.0 0.0 \n", + "\n", + " Engine volume_6.2 Engine volume_6.3 Engine volume_6.3 Turbo \\\n", + "ID \n", + "45758153 0.0 0.0 0.0 \n", + "45699930 0.0 0.0 0.0 \n", + "45646562 0.0 0.0 0.0 \n", + "45656923 0.0 0.0 0.0 \n", + "45815887 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "45802363 0.0 0.0 0.0 \n", + "45812777 0.0 0.0 0.0 \n", + "44104417 0.0 0.0 0.0 \n", + "45793406 0.0 0.0 0.0 \n", + "45700700 0.0 0.0 0.0 \n", + "\n", + " Engine volume_6.7 Engine volume_6.8 Wheel_Right-hand drive \n", + "ID \n", + "45758153 0.0 0.0 0.0 \n", + "45699930 0.0 0.0 0.0 \n", + "45646562 0.0 0.0 0.0 \n", + "45656923 0.0 0.0 0.0 \n", + "45815887 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "45802363 0.0 0.0 0.0 \n", + "45812777 0.0 0.0 0.0 \n", + "44104417 0.0 0.0 1.0 \n", + "45793406 0.0 0.0 0.0 \n", + "45700700 0.0 0.0 0.0 \n", + "\n", + "[15389 rows x 1573 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preprocessing_result = pipeline_end.fit_transform(X_train)\n", + "preprocessed_df = pd.DataFrame(\n", + " preprocessing_result,\n", + " columns=pipeline_end.get_feature_names_out(),\n", + ")\n", + "\n", + "preprocessed_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Формирование набора моделей для классификации\n", + "\n", + "logistic -- логистическая регрессия\n", + "\n", + "ridge -- гребневая регрессия\n", + "\n", + "decision_tree -- дерево решений\n", + "\n", + "knn -- k-ближайших соседей\n", + "\n", + "naive_bayes -- наивный Байесовский классификатор\n", + "\n", + "gradient_boosting -- метод градиентного бустинга (набор деревьев решений)\n", + "\n", + "random_forest -- метод случайного леса (набор деревьев решений)\n", + "\n", + "mlp -- многослойный персептрон (нейронная сеть)\n", + "\n", + "Документация: 
https://scikit-learn.org/1.5/supervised_learning.html" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree\n", + "\n", + "class_models = {\n", + " \"logistic\": {\"model\": linear_model.LogisticRegression()},\n", + " # \"ridge\": {\"model\": linear_model.RidgeClassifierCV(cv=5, class_weight=\"balanced\")},\n", + " \"ridge\": {\"model\": linear_model.LogisticRegression(penalty=\"l2\", class_weight=\"balanced\")},\n", + " \"decision_tree\": {\n", + " \"model\": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)\n", + " },\n", + " \"knn\": {\"model\": neighbors.KNeighborsClassifier(n_neighbors=7)},\n", + " \"naive_bayes\": {\"model\": naive_bayes.GaussianNB()},\n", + " \"gradient_boosting\": {\n", + " \"model\": ensemble.GradientBoostingClassifier(n_estimators=210)\n", + " },\n", + " \"random_forest\": {\n", + " \"model\": ensemble.RandomForestClassifier(\n", + " max_depth=11, class_weight=\"balanced\", random_state=random_state\n", + " )\n", + " },\n", + " \"mlp\": {\n", + " \"model\": neural_network.MLPClassifier(\n", + " hidden_layer_sizes=(7,),\n", + " max_iter=100000,\n", + " early_stopping=True,\n", + " random_state=random_state,\n", + " )\n", + " },\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Gear box type object\n", + "dtype: object\n", + "Gear box type object\n", + "dtype: object\n", + "\n", + "Index: 19237 entries, 45654403 to 45813273\n", + "Data columns (total 17 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Price 19237 non-null int64 \n", + " 1 Levy 19237 non-null object \n", + " 2 Manufacturer 19237 non-null object \n", + " 3 Model 19237 non-null object \n", + " 4 Prod_year 19237 non-null int64 \n", + " 5 Category 
19237 non-null object \n", + " 6 Leather_interior 19237 non-null object \n", + " 7 Fuel type 19237 non-null object \n", + " 8 Engine volume 19237 non-null object \n", + " 9 Mileage 19237 non-null object \n", + " 10 Cylinders 19237 non-null float64\n", + " 11 Gear box type 19237 non-null object \n", + " 12 Drive wheels 19237 non-null object \n", + " 13 Doors 19237 non-null object \n", + " 14 Wheel 19237 non-null object \n", + " 15 Color 19237 non-null object \n", + " 16 Airbags 19237 non-null int64 \n", + "dtypes: float64(1), int64(3), object(13)\n", + "memory usage: 2.6+ MB\n" + ] + } + ], + "source": [ + "print(y_train.dtypes)\n", + "print(y_test.dtypes)\n", + "df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Обучение моделей на обучающем наборе данных и оценка на тестовом" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: logistic\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n", + "c:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1, 3] during transform. 
These unknown categories will be encoded as all zeros\n", + " warnings.warn(\n" + ] + }, + { + "ename": "ValueError", + "evalue": "Mix of label input types (string and number)", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[16], line 28\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# class_models[model_name][\"Precision_train\"] = metrics.precision_score(\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# y_train, y_train_predict, average=\"micro\"\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m# )\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;66;03m# class_models[model_name][\"Precision_test\"] = metrics.precision_score(\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# y_test, y_test_predict\u001b[39;00m\n\u001b[0;32m 24\u001b[0m \u001b[38;5;66;03m# )\u001b[39;00m\n\u001b[0;32m 25\u001b[0m class_models[model_name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRecall_train\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m metrics\u001b[38;5;241m.\u001b[39mrecall_score(\n\u001b[0;32m 26\u001b[0m y_train, y_train_predict, average\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmicro\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 27\u001b[0m )\n\u001b[1;32m---> 28\u001b[0m class_models[model_name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRecall_test\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mmetrics\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrecall_score\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 29\u001b[0m \u001b[43m \u001b[49m\u001b[43my_test\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_test_predict\u001b[49m\n\u001b[0;32m 30\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 31\u001b[0m 
class_models[model_name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAccuracy_train\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m metrics\u001b[38;5;241m.\u001b[39maccuracy_score(\n\u001b[0;32m 32\u001b[0m y_train, y_train_predict\n\u001b[0;32m 33\u001b[0m )\n\u001b[0;32m 34\u001b[0m class_models[model_name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAccuracy_test\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m metrics\u001b[38;5;241m.\u001b[39maccuracy_score(\n\u001b[0;32m 35\u001b[0m y_test, y_test_predict\n\u001b[0;32m 36\u001b[0m )\n", + "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by 
the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n", + "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:2385\u001b[0m, in \u001b[0;36mrecall_score\u001b[1;34m(y_true, y_pred, labels, pos_label, average, sample_weight, zero_division)\u001b[0m\n\u001b[0;32m 2217\u001b[0m \u001b[38;5;129m@validate_params\u001b[39m(\n\u001b[0;32m 2218\u001b[0m {\n\u001b[0;32m 2219\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my_true\u001b[39m\u001b[38;5;124m\"\u001b[39m: [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124marray-like\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msparse matrix\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2244\u001b[0m zero_division\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwarn\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 2245\u001b[0m ):\n\u001b[0;32m 2246\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Compute the recall.\u001b[39;00m\n\u001b[0;32m 2247\u001b[0m \n\u001b[0;32m 2248\u001b[0m \u001b[38;5;124;03m The recall is the ratio ``tp / (tp + fn)`` where 
``tp`` is the number of\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2383\u001b[0m \u001b[38;5;124;03m array([1. , 1. , 0.5])\u001b[39;00m\n\u001b[0;32m 2384\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 2385\u001b[0m _, r, _, _ \u001b[38;5;241m=\u001b[39m \u001b[43mprecision_recall_fscore_support\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 2386\u001b[0m \u001b[43m \u001b[49m\u001b[43my_true\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2387\u001b[0m \u001b[43m \u001b[49m\u001b[43my_pred\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2388\u001b[0m \u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2389\u001b[0m \u001b[43m \u001b[49m\u001b[43mpos_label\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpos_label\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2390\u001b[0m \u001b[43m \u001b[49m\u001b[43maverage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maverage\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2391\u001b[0m \u001b[43m \u001b[49m\u001b[43mwarn_for\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrecall\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2392\u001b[0m \u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msample_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2393\u001b[0m \u001b[43m \u001b[49m\u001b[43mzero_division\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mzero_division\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2394\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2395\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m r\n", + "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:186\u001b[0m, in 
\u001b[0;36mvalidate_params..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 184\u001b[0m global_skip_validation \u001b[38;5;241m=\u001b[39m get_config()[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mskip_parameter_validation\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 185\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m global_skip_validation:\n\u001b[1;32m--> 186\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 188\u001b[0m func_sig \u001b[38;5;241m=\u001b[39m signature(func)\n\u001b[0;32m 190\u001b[0m \u001b[38;5;66;03m# Map *args/**kwargs to the function signature\u001b[39;00m\n", + "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1789\u001b[0m, in \u001b[0;36mprecision_recall_fscore_support\u001b[1;34m(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight, zero_division)\u001b[0m\n\u001b[0;32m 1626\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Compute precision, recall, F-measure and support for each class.\u001b[39;00m\n\u001b[0;32m 1627\u001b[0m \n\u001b[0;32m 1628\u001b[0m \u001b[38;5;124;03mThe precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1786\u001b[0m \u001b[38;5;124;03m array([2, 2, 2]))\u001b[39;00m\n\u001b[0;32m 1787\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 1788\u001b[0m _check_zero_division(zero_division)\n\u001b[1;32m-> 1789\u001b[0m labels \u001b[38;5;241m=\u001b[39m \u001b[43m_check_set_wise_labels\u001b[49m\u001b[43m(\u001b[49m\u001b[43my_true\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43my_pred\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maverage\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpos_label\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1791\u001b[0m \u001b[38;5;66;03m# Calculate tp_sum, pred_sum, true_sum ###\u001b[39;00m\n\u001b[0;32m 1792\u001b[0m samplewise \u001b[38;5;241m=\u001b[39m average \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msamples\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1564\u001b[0m, in \u001b[0;36m_check_set_wise_labels\u001b[1;34m(y_true, y_pred, average, labels, pos_label)\u001b[0m\n\u001b[0;32m 1561\u001b[0m y_type, y_true, y_pred \u001b[38;5;241m=\u001b[39m _check_targets(y_true, y_pred)\n\u001b[0;32m 1562\u001b[0m \u001b[38;5;66;03m# Convert to Python primitive type to avoid NumPy type / Python str\u001b[39;00m\n\u001b[0;32m 1563\u001b[0m \u001b[38;5;66;03m# comparison. 
See https://github.com/numpy/numpy/issues/6784\u001b[39;00m\n\u001b[1;32m-> 1564\u001b[0m present_labels \u001b[38;5;241m=\u001b[39m \u001b[43munique_labels\u001b[49m\u001b[43m(\u001b[49m\u001b[43my_true\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_pred\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mtolist()\n\u001b[0;32m 1565\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m average \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbinary\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 1566\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m y_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbinary\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\utils\\multiclass.py:114\u001b[0m, in \u001b[0;36munique_labels\u001b[1;34m(*ys)\u001b[0m\n\u001b[0;32m 112\u001b[0m \u001b[38;5;66;03m# Check that we don't mix string type with number type\u001b[39;00m\n\u001b[0;32m 113\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mset\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(label, \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m label \u001b[38;5;129;01min\u001b[39;00m ys_labels)) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m--> 114\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMix of label input types (string and number)\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m xp\u001b[38;5;241m.\u001b[39masarray(\u001b[38;5;28msorted\u001b[39m(ys_labels))\n", + "\u001b[1;31mValueError\u001b[0m: Mix of label input types (string and number)" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn import metrics\n", + "\n", + "for model_name in class_models.keys():\n", + " print(f\"Model: 
{model_name}\")\n", + " model = class_models[model_name][\"model\"]\n", + "\n", + " model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n", + " model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())\n", + "\n", + " y_train_predict = model_pipeline.predict(X_train)\n", + " y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]\n", + " y_test_predict = np.where(y_test_probs > 0.5, 1, 0)\n", + "\n", + " class_models[model_name][\"pipeline\"] = model_pipeline\n", + " class_models[model_name][\"probs\"] = y_test_probs\n", + " class_models[model_name][\"preds\"] = y_test_predict\n", + "\n", + " class_models[model_name][\"Precision_train\"] = metrics.precision_score(\n", + " y_train, y_train_predict, average=\"micro\"\n", + " )\n", + " class_models[model_name][\"Precision_test\"] = metrics.precision_score(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"Recall_train\"] = metrics.recall_score(\n", + " y_train, y_train_predict, average=\"micro\"\n", + " )\n", + " class_models[model_name][\"Recall_test\"] = metrics.recall_score(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"Accuracy_train\"] = metrics.accuracy_score(\n", + " y_train, y_train_predict\n", + " )\n", + " class_models[model_name][\"Accuracy_test\"] = metrics.accuracy_score(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"ROC_AUC_test\"] = metrics.roc_auc_score(\n", + " y_test, y_test_probs\n", + " )\n", + " class_models[model_name][\"F1_train\"] = metrics.f1_score(y_train, y_train_predict)\n", + " class_models[model_name][\"F1_test\"] = metrics.f1_score(y_test, y_test_predict)\n", + " class_models[model_name][\"MCC_test\"] = metrics.matthews_corrcoef(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(\n", + " y_test, y_test_predict\n", + " )\n", + " class_models[model_name][\"Confusion_matrix\"] = 
metrics.confusion_matrix(\n", + " y_test, y_test_predict\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Сводная таблица оценок качества для использованных моделей классификации\n", + "\n", + "Документация: https://scikit-learn.org/1.5/modules/model_evaluation.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Матрица неточностей" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import ConfusionMatrixDisplay\n", + "import matplotlib.pyplot as plt\n", + "\n", + "_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(12, 10), sharex=False, sharey=False)\n", + "for index, key in enumerate(class_models.keys()):\n", + " c_matrix = class_models[key][\"Confusion_matrix\"]\n", + " disp = ConfusionMatrixDisplay(\n", + " confusion_matrix=c_matrix, display_labels=[\"Died\", \"Sirvived\"]\n", + " ).plot(ax=ax.flat[index])\n", + " disp.ax_.set_title(key)\n", + "\n", + "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Точность, полнота, верность (аккуратность), F-мера" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n", + " [\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " \"Accuracy_train\",\n", + " \"Accuracy_test\",\n", + " \"F1_train\",\n", + " \"F1_test\",\n", + " ]\n", + "]\n", + "class_metrics.sort_values(\n", + " by=\"Accuracy_test\", ascending=False\n", + ").style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " 
high=0.3,\n", + " subset=[\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n", + " [\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " \"ROC_AUC_test\",\n", + " \"Cohen_kappa_test\",\n", + " \"MCC_test\",\n", + " ]\n", + "]\n", + "class_metrics.sort_values(by=\"ROC_AUC_test\", ascending=False).style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\n", + " \"ROC_AUC_test\",\n", + " \"MCC_test\",\n", + " \"Cohen_kappa_test\",\n", + " ],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " high=0.3,\n", + " subset=[\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "best_model = str(class_metrics.sort_values(by=\"MCC_test\", ascending=False).iloc[0].name)\n", + "\n", + "display(best_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Вывод данных с ошибкой предсказания для оценки" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preprocessing_result = pipeline_end.transform(X_test)\n", + "preprocessed_df = pd.DataFrame(\n", + " preprocessing_result,\n", + " columns=pipeline_end.get_feature_names_out(),\n", + ")\n", + "\n", + "y_pred = class_models[best_model][\"preds\"]\n", + "\n", + "error_index = y_test[y_test[\"Survived\"] != y_pred].index.tolist()\n", + "display(f\"Error items count: {len(error_index)}\")\n", + "\n", + "error_predicted = pd.Series(y_pred, 
index=y_test.index).loc[error_index]\n", + "error_df = X_test.loc[error_index].copy()\n", + "error_df.insert(loc=1, column=\"Predicted\", value=error_predicted)\n", + "error_df.sort_index()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Пример использования обученной модели (конвейера) для предсказания" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = class_models[best_model][\"pipeline\"]\n", + "\n", + "example_id = 450\n", + "test = pd.DataFrame(X_test.loc[example_id, :]).T\n", + "test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T\n", + "display(test)\n", + "display(test_preprocessed)\n", + "result_proba = model.predict_proba(test)[0]\n", + "result = model.predict(test)[0]\n", + "real = int(y_test.loc[example_id].values[0])\n", + "display(f\"predicted: {result} (proba: {result_proba})\")\n", + "display(f\"real: {real}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Подбор гиперпараметров методом поиска по сетке\n", + "\n", + "https://www.kaggle.com/code/sociopath00/random-forest-using-gridsearchcv\n", + "\n", + "https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "optimized_model_type = \"random_forest\"\n", + "\n", + "random_forest_model = class_models[optimized_model_type][\"pipeline\"]\n", + "\n", + "param_grid = {\n", + " \"model__n_estimators\": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],\n", + " \"model__max_features\": [\"sqrt\", \"log2\", 2],\n", + " \"model__max_depth\": [2, 3, 4, 5, 6, 7, 8, 9 ,10],\n", + " \"model__criterion\": [\"gini\", \"entropy\", \"log_loss\"],\n", + "}\n", + "\n", + "gs_optomizer = GridSearchCV(\n", + " estimator=random_forest_model, 
param_grid=param_grid, n_jobs=-1\n", + ")\n", + "gs_optomizer.fit(X_train, y_train.values.ravel())\n", + "gs_optomizer.best_params_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Обучение модели с новыми гиперпараметрами" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "optimized_model = ensemble.RandomForestClassifier(\n", + " random_state=random_state,\n", + " criterion=\"gini\",\n", + " max_depth=7,\n", + " max_features=\"sqrt\",\n", + " n_estimators=30,\n", + ")\n", + "\n", + "result = {}\n", + "\n", + "result[\"pipeline\"] = Pipeline([(\"pipeline\", pipeline_end), (\"model\", optimized_model)]).fit(X_train, y_train.values.ravel())\n", + "result[\"train_preds\"] = result[\"pipeline\"].predict(X_train)\n", + "result[\"probs\"] = result[\"pipeline\"].predict_proba(X_test)[:, 1]\n", + "result[\"preds\"] = np.where(result[\"probs\"] > 0.5, 1, 0)\n", + "\n", + "result[\"Precision_train\"] = metrics.precision_score(y_train, result[\"train_preds\"])\n", + "result[\"Precision_test\"] = metrics.precision_score(y_test, result[\"preds\"])\n", + "result[\"Recall_train\"] = metrics.recall_score(y_train, result[\"train_preds\"])\n", + "result[\"Recall_test\"] = metrics.recall_score(y_test, result[\"preds\"])\n", + "result[\"Accuracy_train\"] = metrics.accuracy_score(y_train, result[\"train_preds\"])\n", + "result[\"Accuracy_test\"] = metrics.accuracy_score(y_test, result[\"preds\"])\n", + "result[\"ROC_AUC_test\"] = metrics.roc_auc_score(y_test, result[\"probs\"])\n", + "result[\"F1_train\"] = metrics.f1_score(y_train, result[\"train_preds\"])\n", + "result[\"F1_test\"] = metrics.f1_score(y_test, result[\"preds\"])\n", + "result[\"MCC_test\"] = metrics.matthews_corrcoef(y_test, result[\"preds\"])\n", + "result[\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(y_test, result[\"preds\"])\n", + "result[\"Confusion_matrix\"] = metrics.confusion_matrix(y_test, result[\"preds\"])" + ] + }, 
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Формирование данных для оценки старой и новой версии модели" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "optimized_metrics = pd.DataFrame(columns=list(result.keys()))\n", + "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n", + " data=class_models[optimized_model_type]\n", + ")\n", + "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n", + " data=result\n", + ")\n", + "optimized_metrics.insert(loc=0, column=\"Name\", value=[\"Old\", \"New\"])\n", + "optimized_metrics = optimized_metrics.set_index(\"Name\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Оценка параметров старой и новой модели" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "optimized_metrics[\n", + " [\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " \"Accuracy_train\",\n", + " \"Accuracy_test\",\n", + " \"F1_train\",\n", + " \"F1_test\",\n", + " ]\n", + "].style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n", + ").background_gradient(\n", + " cmap=\"viridis\",\n", + " low=1,\n", + " high=0.3,\n", + " subset=[\n", + " \"Precision_train\",\n", + " \"Precision_test\",\n", + " \"Recall_train\",\n", + " \"Recall_test\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "optimized_metrics[\n", + " [\n", + " \"Accuracy_test\",\n", + " \"F1_test\",\n", + " \"ROC_AUC_test\",\n", + " \"Cohen_kappa_test\",\n", + " \"MCC_test\",\n", + " ]\n", + "].style.background_gradient(\n", + " cmap=\"plasma\",\n", + " low=0.3,\n", + " high=1,\n", + " subset=[\n", + " \"ROC_AUC_test\",\n", + " \"MCC_test\",\n", 
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class TitanicFeatures(BaseEstimator, TransformerMixin):
    """Feature-engineering transformer for the Titanic dataset.

    Adds two derived columns to the input frame:

    * ``Is_married`` -- 1 when the honorific parsed from ``Name``
      (the token between the comma and the period, e.g.
      "Braund, Mr. Owen Harris" -> "Mr") equals "Mrs", else 0.
    * ``Cabin_type`` -- first character of ``Cabin`` (the deck letter),
      or ``"unknown"`` when the cabin value is missing.
    """

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``Is_married`` and ``Cabin_type`` added.

        The original code mutated the caller's DataFrame in place, which
        violates the scikit-learn convention that transformers are
        side-effect free; we work on a copy instead.
        """

        def get_title(name) -> str:
            # "Surname, Title. Given names" -> "Title"
            return name.split(",")[1].split(".")[0].strip()

        def get_cabin_type(cabin) -> str:
            # NaN cabins (missing data) are bucketed as "unknown".
            return "unknown" if pd.isna(cabin) else cabin[0]

        X = X.copy()
        X["Is_married"] = [1 if get_title(name) == "Mrs" else 0 for name in X["Name"]]
        X["Cabin_type"] = [get_cabin_type(cabin) for cabin in X["Cabin"]]
        return X

    def get_feature_names_out(self, features_in):
        # Output schema = input feature names plus the two engineered columns.
        return np.append(features_in, ["Is_married", "Cabin_type"], axis=0)
def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
    """
    Split a Pandas dataframe into three stratified subsets (train/val/test).

    Each subset preserves the relative frequency of the values in
    ``stratify_colname``; the split is performed by running
    train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column used for stratification. Usually this
        column would be for the label.
    frac_train : float
    frac_val : float
    frac_test : float
        The ratios with which the dataframe will be split into train,
        val, and test data. The values should be expressed as float
        fractions and should sum to 1.0 (checked with a small tolerance).
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test, y_train, y_val, y_test :
        The three feature splits and the corresponding
        stratification-column splits (six dataframes in total).  When
        ``frac_val <= 0``, the val parts are returned as empty dataframes.

    Raises
    ------
    ValueError
        If the fractions do not sum to 1.0, or if ``stratify_colname``
        is not a column of ``df_input``.
    """

    # Exact float equality (`!= 1.0`) wrongly rejects valid triples such
    # as (0.6, 0.3, 0.1), whose float sum is 0.9999999999999999; compare
    # with a tolerance instead.
    if abs(frac_train + frac_val + frac_test - 1.0) > 1e-9:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    if frac_val <= 0:
        # No validation subset requested: "temp" is the whole test split.
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp

    # Split the temp dataframe into val and test dataframes, rescaling
    # the test fraction relative to the remaining (val + test) mass.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test