MII/lec2.ipynb

839 lines
52 KiB
Plaintext
Raw Normal View History

2024-09-21 08:59:30 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Загрузка данных в DataFrame"
]
},
{
"cell_type": "code",
2024-10-24 21:50:12 +04:00
"execution_count": 13,
2024-09-21 08:59:30 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
2024-10-24 21:50:12 +04:00
"Index: 235 entries, 1 to 235\n",
2024-09-21 08:59:30 +04:00
"Data columns (total 11 columns):\n",
2024-10-24 21:50:12 +04:00
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Country (or dependency) 235 non-null object\n",
" 1 Population 2020 235 non-null object\n",
" 2 Yearly Change 235 non-null object\n",
" 3 Net Change 235 non-null object\n",
" 4 Density(P/Km²) 235 non-null object\n",
" 5 Land Area (Km²) 235 non-null object\n",
" 6 Migrants (net) 201 non-null object\n",
" 7 Fert. Rate 235 non-null object\n",
" 8 MedAge 235 non-null object\n",
" 9 Urban Pop % 235 non-null object\n",
" 10 World Share 235 non-null object\n",
"dtypes: object(11)\n",
"memory usage: 22.0+ KB\n",
"(235, 11)\n"
2024-09-21 08:59:30 +04:00
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
2024-10-24 21:50:12 +04:00
" <th>Country (or dependency)</th>\n",
" <th>Population 2020</th>\n",
" <th>Yearly Change</th>\n",
" <th>Net Change</th>\n",
" <th>Density(P/Km²)</th>\n",
" <th>Land Area (Km²)</th>\n",
" <th>Migrants (net)</th>\n",
" <th>Fert. Rate</th>\n",
" <th>MedAge</th>\n",
" <th>Urban Pop %</th>\n",
" <th>World Share</th>\n",
2024-09-21 08:59:30 +04:00
" </tr>\n",
" <tr>\n",
2024-10-24 21:50:12 +04:00
" <th>no</th>\n",
2024-09-21 08:59:30 +04:00
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
2024-10-24 21:50:12 +04:00
" <td>China</td>\n",
" <td>1,439,323,776</td>\n",
" <td>0.39%</td>\n",
" <td>5,540,090</td>\n",
" <td>153</td>\n",
" <td>9,388,211</td>\n",
" <td>-348,399</td>\n",
" <td>1.7</td>\n",
" <td>38</td>\n",
" <td>61%</td>\n",
" <td>18.47%</td>\n",
2024-09-21 08:59:30 +04:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
2024-10-24 21:50:12 +04:00
" <td>India</td>\n",
" <td>1,380,004,385</td>\n",
" <td>0.99%</td>\n",
" <td>13,586,631</td>\n",
" <td>464</td>\n",
" <td>2,973,190</td>\n",
" <td>-532,687</td>\n",
" <td>2.2</td>\n",
" <td>28</td>\n",
" <td>35%</td>\n",
" <td>17.70%</td>\n",
2024-09-21 08:59:30 +04:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
2024-10-24 21:50:12 +04:00
" <td>United States</td>\n",
" <td>331,002,651</td>\n",
" <td>0.59%</td>\n",
" <td>1,937,734</td>\n",
" <td>36</td>\n",
" <td>9,147,420</td>\n",
" <td>954,806</td>\n",
" <td>1.8</td>\n",
" <td>38</td>\n",
" <td>83%</td>\n",
" <td>4.25%</td>\n",
2024-09-21 08:59:30 +04:00
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
2024-10-24 21:50:12 +04:00
" <td>Indonesia</td>\n",
" <td>273,523,615</td>\n",
" <td>1.07%</td>\n",
" <td>2,898,047</td>\n",
" <td>151</td>\n",
" <td>1,811,570</td>\n",
" <td>-98,955</td>\n",
" <td>2.3</td>\n",
" <td>30</td>\n",
" <td>56%</td>\n",
" <td>3.51%</td>\n",
2024-09-21 08:59:30 +04:00
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
2024-10-24 21:50:12 +04:00
" <td>Pakistan</td>\n",
" <td>220,892,340</td>\n",
" <td>2.00%</td>\n",
" <td>4,327,022</td>\n",
" <td>287</td>\n",
" <td>770,880</td>\n",
" <td>-233,379</td>\n",
" <td>3.6</td>\n",
" <td>23</td>\n",
" <td>35%</td>\n",
" <td>2.83%</td>\n",
2024-09-21 08:59:30 +04:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
2024-10-24 21:50:12 +04:00
" Country (or dependency) Population 2020 Yearly Change Net Change \\\n",
"no \n",
"1 China 1,439,323,776 0.39% 5,540,090 \n",
"2 India 1,380,004,385 0.99% 13,586,631 \n",
"3 United States 331,002,651 0.59% 1,937,734 \n",
"4 Indonesia 273,523,615 1.07% 2,898,047 \n",
"5 Pakistan 220,892,340 2.00% 4,327,022 \n",
2024-09-21 08:59:30 +04:00
"\n",
2024-10-24 21:50:12 +04:00
" Density(P/Km²) Land Area (Km²) Migrants (net) Fert. Rate MedAge \\\n",
"no \n",
"1 153 9,388,211 -348,399 1.7 38 \n",
"2 464 2,973,190 -532,687 2.2 28 \n",
"3 36 9,147,420 954,806 1.8 38 \n",
"4 151 1,811,570 -98,955 2.3 30 \n",
"5 287 770,880 -233,379 3.6 23 \n",
2024-09-21 08:59:30 +04:00
"\n",
2024-10-24 21:50:12 +04:00
" Urban Pop % World Share \n",
"no \n",
"1 61% 18.47% \n",
"2 35% 17.70% \n",
"3 83% 4.25% \n",
"4 56% 3.51% \n",
"5 35% 2.83% "
2024-09-21 08:59:30 +04:00
]
},
2024-10-24 21:50:12 +04:00
"execution_count": 13,
2024-09-21 08:59:30 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
2024-10-24 21:50:12 +04:00
"df = pd.read_csv(\"data/population.csv\", index_col=\"no\")\n",
2024-09-21 08:59:30 +04:00
"\n",
"df.info()\n",
"\n",
"print(df.shape)\n",
"\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Получение сведений о пропущенных данных"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Типы пропущенных данных:\n",
"- None - представление пустых данных в Python\n",
"- NaN - представление пустых данных в Pandas\n",
"- '' - пустая строка"
]
},
{
"cell_type": "code",
2024-10-24 21:50:12 +04:00
"execution_count": 6,
2024-09-21 08:59:30 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-10-24 21:50:12 +04:00
"Country (or dependency) 0\n",
"Population 2020 0\n",
"Yearly Change 0\n",
"Net Change 0\n",
"Density (P/Km²) 0\n",
"Land Area (Km²) 0\n",
"Migrants (net) 34\n",
"Fert. Rate 0\n",
"Med. Age 0\n",
"Urban Pop % 0\n",
"World Share 0\n",
2024-09-21 08:59:30 +04:00
"dtype: int64\n",
"\n",
2024-10-24 21:50:12 +04:00
"Country (or dependency) False\n",
"Population 2020 False\n",
"Yearly Change False\n",
"Net Change False\n",
"Density (P/Km²) False\n",
"Land Area (Km²) False\n",
"Migrants (net) True\n",
"Fert. Rate False\n",
"Med. Age False\n",
"Urban Pop % False\n",
"World Share False\n",
2024-09-21 08:59:30 +04:00
"dtype: bool\n",
"\n",
2024-10-24 21:50:12 +04:00
"Migrants (net) процент пустых значений: %14.47\n"
2024-09-21 08:59:30 +04:00
]
}
],
"source": [
2024-10-24 21:50:12 +04:00
"\n",
"\n",
2024-09-21 08:59:30 +04:00
"# Количество пустых значений признаков\n",
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Есть ли пустые значения признаков\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Процент пустых значений признаков\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Заполнение пропущенных данных\n",
"\n",
"https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
"\n",
"https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
]
},
{
"cell_type": "code",
2024-10-24 21:50:12 +04:00
"execution_count": 11,
2024-09-21 08:59:30 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-10-24 21:50:12 +04:00
"(235, 12)\n",
"Country (or dependency) False\n",
"Population 2020 False\n",
"Yearly Change False\n",
"Net Change False\n",
"Density (P/Km²) False\n",
"Land Area (Km²) False\n",
"Migrants (net) False\n",
"Fert. Rate False\n",
"Med. Age False\n",
"Urban Pop % False\n",
"World Share False\n",
"MigrantsFill False\n",
2024-09-21 08:59:30 +04:00
"dtype: bool\n"
]
},
{
2024-10-24 21:50:12 +04:00
"ename": "TypeError",
"evalue": "Cannot convert ['-348,399' '-532,687' '954,806' '-98,955' '-233,379' '21,200' '-60,000'\n '-369,501' '182,456' '-60,000' '71,560' '30,000' '-67,152' '-38,033'\n '-80,000' '23,861' '283,922' '-55,000' '543,822' '19,444' '260,650'\n '36,527' '148,943' '-40,076' '145,405' '-163,313' '-10,000' '11,731'\n '204,796' '40,000' '168,694' '4,800' '-10,000' '-50,000' '10,000' '7,834'\n '-62,920' '-29,395' '242,032' '-51,419' '134,979' '-8,863' '99,069'\n '6,413' '50,000' '-5,000' '-10,000' '-30,000' '41,710' '-653,249'\n '-1,500' '-4,800' '-8,000' '-5,403' '158,246' '4,000' '30,001' '-97,986'\n '-25,000' '-40,000' '-73,999' '-16,053' '111,708' '-18,000' '-8,000'\n '-9,215' '36,400' '-427,391' '16,000' '-20,000' '-30,000' '2,000'\n '-40,000' '-116,858' '-4,000' '-9,000' '-2,000' '2,001' '-4,000' '-9,504'\n '48,000' '-35,000' '-14,400' '-174,200' '-30,000' '22,011' '-16,000'\n '10,220' '-6,000' '1,200' '40,000' '-6,800' '40,000' '6,000' '-20,000'\n '8,730' '65,000' '-800' '4,000' '10,000' '52,000' '-2,000' '-4,200'\n '29,308' '-14,704' '-16,556' '-4,800' '-1,999' '-30,012' '-21,272'\n '-4,000' '-40,539' '-5,000' '27,028' '15,200' '14,000' '-4,000' '1,485'\n '28,000' '87,400' '-10,563' '4,200' '-5,000' '23,604' '-40,000' '14,881'\n '5,000' '11,200' '39,520' '-8,001' '-1,387' '-10,000' '-39,858' '-3,000'\n '-21,585' '-852' '-4,998' '-11,332' '40,000' '-14,000' '-97,986'\n '-32,780' '-4,806' '-3,087' '3,000' '3,260' '-10,047' '-1,000' '2,000'\n '-1,399' '-14,837' '47,800' '16,000' '-800' '3,911' '-5,385' '0' '5,000'\n '-8,353' '900' '-6,202' '-1,256' '-2,000' '-6,000' '320' '-1,600' '5,000'\n '-480' '9,741' '5,582' '-1,000' '-1,342' '-2,957' '11,370' '900' '0'\n '-1,440' '1,200' '1,000' '-960' '380' '120' '1,200' '-79' '502' '-1,000'\n '0' '-1,680' '-2,803' '0' '1,351' '-506' '515' '-800' '-200' '-200' '201'\n '-800' '-451' '-200' '0' nan nan nan nan nan nan nan nan nan nan nan nan\n nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan\n nan nan nan 
nan] to numeric",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[11], line 11\u001b[0m\n\u001b[0;32m 8\u001b[0m df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMigrantsFill\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMigrants (net)\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mfillna(\u001b[38;5;241m0\u001b[39m)\n\u001b[0;32m 10\u001b[0m \u001b[38;5;66;03m# Замена пустых данных на медиану\u001b[39;00m\n\u001b[1;32m---> 11\u001b[0m df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMigrantsMedian\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMigrants (net)\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mfillna(\u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mMigrants (net)\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmedian\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 13\u001b[0m df\u001b[38;5;241m.\u001b[39mtail()\n",
"File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\pandas\\core\\series.py:6559\u001b[0m, in \u001b[0;36mSeries.median\u001b[1;34m(self, axis, skipna, numeric_only, **kwargs)\u001b[0m\n\u001b[0;32m 6551\u001b[0m \u001b[38;5;129m@doc\u001b[39m(make_doc(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmedian\u001b[39m\u001b[38;5;124m\"\u001b[39m, ndim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m))\n\u001b[0;32m 6552\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mmedian\u001b[39m(\n\u001b[0;32m 6553\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 6557\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 6558\u001b[0m ):\n\u001b[1;32m-> 6559\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mNDFrame\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmedian\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnumeric_only\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py:12431\u001b[0m, in \u001b[0;36mNDFrame.median\u001b[1;34m(self, axis, skipna, numeric_only, **kwargs)\u001b[0m\n\u001b[0;32m 12424\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mmedian\u001b[39m(\n\u001b[0;32m 12425\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 12426\u001b[0m axis: Axis \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12429\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 12430\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Series \u001b[38;5;241m|\u001b[39m \u001b[38;5;28mfloat\u001b[39m:\n\u001b[1;32m> 12431\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_stat_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 12432\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmedian\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnanops\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnanmedian\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnumeric_only\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[0;32m 12433\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py:12377\u001b[0m, in \u001b[0;36mNDFrame._stat_function\u001b[1;34m(self, name, func, axis, skipna, numeric_only, **kwargs)\u001b[0m\n\u001b[0;32m 12373\u001b[0m nv\u001b[38;5;241m.\u001b[39mvalidate_func(name, (), kwargs)\n\u001b[0;32m 12375\u001b[0m validate_bool_kwarg(skipna, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mskipna\u001b[39m\u001b[38;5;124m\"\u001b[39m, none_allowed\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m> 12377\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_reduce\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 12378\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnumeric_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnumeric_only\u001b[49m\n\u001b[0;32m 12379\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\pandas\\core\\series.py:6457\u001b[0m, in \u001b[0;36mSeries._reduce\u001b[1;34m(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)\u001b[0m\n\u001b[0;32m 6452\u001b[0m \u001b[38;5;66;03m# GH#47500 - change to TypeError to match other methods\u001b[39;00m\n\u001b[0;32m 6453\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[0;32m 6454\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSeries.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m does not allow \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkwd_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnumeric_only\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 6455\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwith non-numeric dtypes.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 6456\u001b[0m )\n\u001b[1;32m-> 6457\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdelegate\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\pandas\\core\\nanops.py:147\u001b[0m, in \u001b[0;36mbottleneck_switch.__call__.<locals>.f\u001b[1;34m(values, axis, skipna, **kwds)\u001b[0m\n\u001b[0;32m 145\u001b[0m result \u001b[38;5;241m=\u001b[39m alt(values, axis\u001b[38;5;241m=\u001b[39maxis, skipna\u001b[38;5;241m=\u001b[39mskipna, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds)\n\u001b[0;32m 146\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 147\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43malt\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 149\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n",
"File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\pandas\\core\\nanops.py:787\u001b[0m, in \u001b[0;36mnanmedian\u001b[1;34m(values, axis, skipna, mask)\u001b[0m\n\u001b[0;32m 785\u001b[0m inferred \u001b[38;5;241m=\u001b[39m lib\u001b[38;5;241m.\u001b[39minfer_dtype(values)\n\u001b[0;32m 786\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inferred \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstring\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmixed\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[1;32m--> 787\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot convert \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalues\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m to numeric\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 788\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 789\u001b[0m values \u001b[38;5;241m=\u001b[39m values\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mf8\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"\u001b[1;31mTypeError\u001b[0m: Cannot convert ['-348,399' '-532,687' '954,806' '-98,955' '-233,379' '21,200' '-60,000'\n '-369,501' '182,456' '-60,000' '71,560' '30,000' '-67,152' '-38,033'\n '-80,000' '23,861' '283,922' '-55,000' '543,822' '19,444' '260,650'\n '36,527' '148,943' '-40,076' '145,405' '-163,313' '-10,000' '11,731'\n '204,796' '40,000' '168,694' '4,800' '-10,000' '-50,000' '10,000' '7,834'\n '-62,920' '-29,395' '242,032' '-51,419' '134,979' '-8,863' '99,069'\n '6,413' '50,000' '-5,000' '-10,000' '-30,000' '41,710' '-653,249'\n '-1,500' '-4,800' '-8,000' '-5,403' '158,246' '4,000' '30,001' '-97,986'\n '-25,000' '-40,000' '-73,999' '-16,053' '111,708' '-18,000' '-8,000'\n '-9,215' '36,400' '-427,391' '16,000' '-20,000' '-30,000' '2,000'\n '-40,000' '-116,858' '-4,000' '-9,000' '-2,000' '2,001' '-4,000' '-9,504'\n '48,000' '-35,000' '-14,400' '-174,200' '-30,000' '22,011' '-16,000'\n '10,220' '-6,000' '1,200' '40,000' '-6,800' '40,000' '6,000' '-20,000'\n '8,730' '65,000' '-800' '4,000' '10,000' '52,000' '-2,000' '-4,200'\n '29,308' '-14,704' '-16,556' '-4,800' '-1,999' '-30,012' '-21,272'\n '-4,000' '-40,539' '-5,000' '27,028' '15,200' '14,000' '-4,000' '1,485'\n '28,000' '87,400' '-10,563' '4,200' '-5,000' '23,604' '-40,000' '14,881'\n '5,000' '11,200' '39,520' '-8,001' '-1,387' '-10,000' '-39,858' '-3,000'\n '-21,585' '-852' '-4,998' '-11,332' '40,000' '-14,000' '-97,986'\n '-32,780' '-4,806' '-3,087' '3,000' '3,260' '-10,047' '-1,000' '2,000'\n '-1,399' '-14,837' '47,800' '16,000' '-800' '3,911' '-5,385' '0' '5,000'\n '-8,353' '900' '-6,202' '-1,256' '-2,000' '-6,000' '320' '-1,600' '5,000'\n '-480' '9,741' '5,582' '-1,000' '-1,342' '-2,957' '11,370' '900' '0'\n '-1,440' '1,200' '1,000' '-960' '380' '120' '1,200' '-79' '502' '-1,000'\n '0' '-1,680' '-2,803' '0' '1,351' '-506' '515' '-800' '-200' '-200' '201'\n '-800' '-451' '-200' '0' nan nan nan nan nan nan nan nan nan nan nan nan\n nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan 
nan nan\n nan nan nan nan] to numeric"
]
2024-09-21 08:59:30 +04:00
}
],
"source": [
"fillna_df = df.fillna(0)\n",
"\n",
2024-10-24 21:50:12 +04:00
"print(fillna_df.shape) # размеры\n",
2024-09-21 08:59:30 +04:00
"\n",
"print(fillna_df.isnull().any())\n",
"\n",
"# Замена пустых данных на 0\n",
2024-10-24 21:50:12 +04:00
"df[\"MigrantsFill\"] = df[\"Migrants (net)\"].fillna(0)\n",
2024-09-21 08:59:30 +04:00
"\n",
"# Замена пустых данных на медиану\n",
2024-10-24 21:50:12 +04:00
"df[\"MigrantsMedian\"] = df[\"Migrants (net)\"].fillna(df[\"Migrants (net)\"].median())\n",
2024-09-21 08:59:30 +04:00
"\n",
"df.tail()"
]
},
{
"cell_type": "code",
2024-10-24 21:50:12 +04:00
"execution_count": 13,
2024-09-21 08:59:30 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
2024-10-24 21:50:12 +04:00
" <th>Country (or dependency)</th>\n",
" <th>Population 2020</th>\n",
" <th>Yearly Change</th>\n",
" <th>Net Change</th>\n",
" <th>Density (P/Km²)</th>\n",
" <th>Land Area (Km²)</th>\n",
" <th>Migrants (net)</th>\n",
" <th>Fert. Rate</th>\n",
" <th>Med. Age</th>\n",
" <th>Urban Pop %</th>\n",
" <th>World Share</th>\n",
" <th>MigrantsFill</th>\n",
" <th>MigrantsCopy</th>\n",
2024-09-21 08:59:30 +04:00
" </tr>\n",
" <tr>\n",
2024-10-24 21:50:12 +04:00
" <th>no</th>\n",
2024-09-21 08:59:30 +04:00
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2024-10-24 21:50:12 +04:00
" <th>231</th>\n",
" <td>Montserrat</td>\n",
" <td>4,992</td>\n",
" <td>0.06%</td>\n",
" <td>3</td>\n",
" <td>50</td>\n",
" <td>100</td>\n",
" <td>NaN</td>\n",
" <td>N.A.</td>\n",
" <td>N.A.</td>\n",
" <td>10%</td>\n",
" <td>0.00%</td>\n",
2024-09-21 08:59:30 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
2024-10-24 21:50:12 +04:00
" <th>232</th>\n",
" <td>Falkland Islands</td>\n",
" <td>3,480</td>\n",
" <td>3.05%</td>\n",
" <td>103</td>\n",
" <td>0</td>\n",
" <td>12,170</td>\n",
" <td>NaN</td>\n",
" <td>N.A.</td>\n",
" <td>N.A.</td>\n",
" <td>66%</td>\n",
" <td>0.00%</td>\n",
2024-09-21 08:59:30 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
2024-10-24 21:50:12 +04:00
" <th>233</th>\n",
" <td>Niue</td>\n",
" <td>1,626</td>\n",
" <td>0.68%</td>\n",
" <td>11</td>\n",
" <td>6</td>\n",
" <td>260</td>\n",
2024-09-21 08:59:30 +04:00
" <td>NaN</td>\n",
2024-10-24 21:50:12 +04:00
" <td>N.A.</td>\n",
" <td>N.A.</td>\n",
" <td>46%</td>\n",
" <td>0.00%</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2024-09-21 08:59:30 +04:00
" </tr>\n",
" <tr>\n",
2024-10-24 21:50:12 +04:00
" <th>234</th>\n",
" <td>Tokelau</td>\n",
" <td>1,357</td>\n",
" <td>1.27%</td>\n",
" <td>17</td>\n",
" <td>136</td>\n",
" <td>10</td>\n",
" <td>NaN</td>\n",
" <td>N.A.</td>\n",
" <td>N.A.</td>\n",
" <td>0%</td>\n",
" <td>0.00%</td>\n",
2024-09-21 08:59:30 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
2024-10-24 21:50:12 +04:00
" <th>235</th>\n",
" <td>Holy See</td>\n",
" <td>801</td>\n",
" <td>0.25%</td>\n",
" <td>2</td>\n",
" <td>2,003</td>\n",
2024-09-21 08:59:30 +04:00
" <td>0</td>\n",
2024-10-24 21:50:12 +04:00
" <td>NaN</td>\n",
" <td>N.A.</td>\n",
" <td>N.A.</td>\n",
" <td>N.A.</td>\n",
" <td>0.00%</td>\n",
2024-09-21 08:59:30 +04:00
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
2024-10-24 21:50:12 +04:00
" Country (or dependency) Population 2020 Yearly Change Net Change \\\n",
"no \n",
"231 Montserrat 4,992 0.06% 3 \n",
"232 Falkland Islands 3,480 3.05% 103 \n",
"233 Niue 1,626 0.68% 11 \n",
"234 Tokelau 1,357 1.27% 17 \n",
"235 Holy See 801 0.25% 2 \n",
2024-09-21 08:59:30 +04:00
"\n",
2024-10-24 21:50:12 +04:00
" Density (P/Km²) Land Area (Km²) Migrants (net) Fert. Rate Med. Age \\\n",
"no \n",
"231 50 100 NaN N.A. N.A. \n",
"232 0 12,170 NaN N.A. N.A. \n",
"233 6 260 NaN N.A. N.A. \n",
"234 136 10 NaN N.A. N.A. \n",
"235 2,003 0 NaN N.A. N.A. \n",
2024-09-21 08:59:30 +04:00
"\n",
2024-10-24 21:50:12 +04:00
" Urban Pop % World Share MigrantsFill MigrantsCopy \n",
"no \n",
"231 10% 0.00% 0 0 \n",
"232 66% 0.00% 0 0 \n",
"233 46% 0.00% 0 0 \n",
"234 0% 0.00% 0 0 \n",
"235 N.A. 0.00% 0 0 "
2024-09-21 08:59:30 +04:00
]
},
2024-10-24 21:50:12 +04:00
"execution_count": 13,
2024-09-21 08:59:30 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-10-24 21:50:12 +04:00
"df[\"MigrantsCopy\"] = df[\"Migrants (net)\"]\n",
2024-09-21 08:59:30 +04:00
"\n",
"# Замена данных сразу в DataFrame без копирования\n",
2024-10-24 21:50:12 +04:00
"df.fillna({\"MigrantsCopy\": 0}, inplace=True)\n",
2024-09-21 08:59:30 +04:00
"\n",
"df.tail()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Удаление наблюдений с пропусками"
]
},
{
"cell_type": "code",
2024-10-24 21:50:12 +04:00
"execution_count": 14,
2024-09-21 08:59:30 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-10-24 21:50:12 +04:00
"(201, 13)\n",
"Country (or dependency) False\n",
"Population 2020 False\n",
"Yearly Change False\n",
"Net Change False\n",
"Density (P/Km²) False\n",
"Land Area (Km²) False\n",
"Migrants (net) False\n",
"Fert. Rate False\n",
"Med. Age False\n",
"Urban Pop % False\n",
"World Share False\n",
"MigrantsFill False\n",
2024-09-21 08:59:30 +04:00
"dtype: bool\n"
]
}
],
"source": [
"dropna_df = df.dropna()\n",
"\n",
"print(dropna_df.shape)\n",
"\n",
"print(fillna_df.isnull().any())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Создание выборок данных\n",
"\n",
"Библиотека scikit-learn\n",
"\n",
"https://scikit-learn.org/stable/index.html"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"assets/lec2-split.png\" width=\"600\" style=\"background-color: white\">"
]
},
{
"cell_type": "code",
2024-10-24 21:50:12 +04:00
"execution_count": 16,
2024-09-21 08:59:30 +04:00
"metadata": {},
"outputs": [],
"source": [
"# Функция для создания выборок\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"\n",
"def split_stratified_into_train_val_test(\n",
" df_input,\n",
" stratify_colname=\"y\",\n",
" frac_train=0.6,\n",
" frac_val=0.15,\n",
" frac_test=0.25,\n",
" random_state=None,\n",
"):\n",
" \"\"\"\n",
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
" following fractional ratios provided by the user, where each subset is\n",
" stratified by the values in a specific column (that is, each subset has\n",
" the same relative frequency of the values in the column). It performs this\n",
" splitting by running train_test_split() twice.\n",
"\n",
" Parameters\n",
" ----------\n",
" df_input : Pandas dataframe\n",
" Input dataframe to be split.\n",
" stratify_colname : str\n",
" The name of the column that will be used for stratification. Usually\n",
" this column would be for the label.\n",
" frac_train : float\n",
" frac_val : float\n",
" frac_test : float\n",
" The ratios with which the dataframe will be split into train, val, and\n",
" test data. The values should be expressed as float fractions and should\n",
" sum to 1.0.\n",
" random_state : int, None, or RandomStateInstance\n",
" Value to be passed to train_test_split().\n",
"\n",
" Returns\n",
" -------\n",
" df_train, df_val, df_test :\n",
" Dataframes containing the three splits.\n",
" \"\"\"\n",
"\n",
" if frac_train + frac_val + frac_test != 1.0:\n",
" raise ValueError(\n",
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
" % (frac_train, frac_val, frac_test)\n",
" )\n",
"\n",
" if stratify_colname not in df_input.columns:\n",
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"\n",
" X = df_input # Contains all columns.\n",
" y = df_input[\n",
" [stratify_colname]\n",
" ] # Dataframe of just the column on which to stratify.\n",
"\n",
" # Split original dataframe into train and temp dataframes.\n",
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
" )\n",
"\n",
" # Split the temp dataframe into val and test dataframes.\n",
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
" df_val, df_test, y_val, y_test = train_test_split(\n",
" df_temp,\n",
" y_temp,\n",
" stratify=y_temp,\n",
" test_size=relative_frac_test,\n",
" random_state=random_state,\n",
" )\n",
"\n",
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n",
" return df_train, df_val, df_test"
]
},
{
"cell_type": "code",
2024-10-24 21:50:12 +04:00
"execution_count": 17,
2024-09-21 08:59:30 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-10-24 21:50:12 +04:00
"MedAge\n",
"N.A. 34\n",
"19 14\n",
"28 12\n",
"43 11\n",
"32 11\n",
"42 10\n",
"18 10\n",
"20 9\n",
"30 8\n",
"38 7\n",
"26 7\n",
"40 7\n",
"22 7\n",
"31 6\n",
"34 6\n",
"24 6\n",
"17 6\n",
"44 5\n",
"29 5\n",
"41 5\n",
"33 5\n",
"21 5\n",
"45 5\n",
"23 4\n",
"37 4\n",
"36 4\n",
"25 4\n",
"27 4\n",
"39 3\n",
"46 3\n",
"35 3\n",
"47 2\n",
"48 1\n",
"15 1\n",
"16 1\n",
2024-09-21 08:59:30 +04:00
"Name: count, dtype: int64\n"
]
2024-10-24 21:50:12 +04:00
},
{
"ename": "ValueError",
"evalue": "The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[17], line 6\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(df\u001b[38;5;241m.\u001b[39mMedAge\u001b[38;5;241m.\u001b[39mvalue_counts())\n\u001b[0;32m 4\u001b[0m data \u001b[38;5;241m=\u001b[39m df[[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMedAge\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFert. Rate\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDensity(P/Km²)\u001b[39m\u001b[38;5;124m\"\u001b[39m]]\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m----> 6\u001b[0m df_train, df_val, df_test \u001b[38;5;241m=\u001b[39m \u001b[43msplit_stratified_into_train_val_test\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mstratify_colname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mMedAge\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.60\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43mfrac_val\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43mfrac_test\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.20\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 12\u001b[0m \u001b[43m)\u001b[49m\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_train\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28mprint\u001b[39m(df_train\u001b[38;5;241m.\u001b[39mMedAge\u001b[38;5;241m.\u001b[39mvalue_counts())\n",
"Cell \u001b[1;32mIn[16], line 57\u001b[0m, in \u001b[0;36msplit_stratified_into_train_val_test\u001b[1;34m(df_input, stratify_colname, frac_train, frac_val, frac_test, random_state)\u001b[0m\n\u001b[0;32m 52\u001b[0m y \u001b[38;5;241m=\u001b[39m df_input[\n\u001b[0;32m 53\u001b[0m [stratify_colname]\n\u001b[0;32m 54\u001b[0m ] \u001b[38;5;66;03m# Dataframe of just the column on which to stratify.\u001b[39;00m\n\u001b[0;32m 56\u001b[0m \u001b[38;5;66;03m# Split original dataframe into train and temp dataframes.\u001b[39;00m\n\u001b[1;32m---> 57\u001b[0m df_train, df_temp, y_train, y_temp \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_test_split\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 58\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m1.0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mfrac_train\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrandom_state\u001b[49m\n\u001b[0;32m 59\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 61\u001b[0m \u001b[38;5;66;03m# Split the temp dataframe into val and test dataframes.\u001b[39;00m\n\u001b[0;32m 62\u001b[0m relative_frac_test \u001b[38;5;241m=\u001b[39m frac_test \u001b[38;5;241m/\u001b[39m (frac_val \u001b[38;5;241m+\u001b[39m frac_test)\n",
"File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:2806\u001b[0m, in \u001b[0;36mtrain_test_split\u001b[1;34m(test_size, train_size, random_state, shuffle, stratify, *arrays)\u001b[0m\n\u001b[0;32m 2802\u001b[0m CVClass \u001b[38;5;241m=\u001b[39m ShuffleSplit\n\u001b[0;32m 2804\u001b[0m cv \u001b[38;5;241m=\u001b[39m CVClass(test_size\u001b[38;5;241m=\u001b[39mn_test, train_size\u001b[38;5;241m=\u001b[39mn_train, random_state\u001b[38;5;241m=\u001b[39mrandom_state)\n\u001b[1;32m-> 2806\u001b[0m train, test \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcv\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43marrays\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstratify\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2808\u001b[0m train, test \u001b[38;5;241m=\u001b[39m ensure_common_namespace_device(arrays[\u001b[38;5;241m0\u001b[39m], train, test)\n\u001b[0;32m 2810\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(\n\u001b[0;32m 2811\u001b[0m chain\u001b[38;5;241m.\u001b[39mfrom_iterable(\n\u001b[0;32m 2812\u001b[0m (_safe_indexing(a, train), _safe_indexing(a, test)) \u001b[38;5;28;01mfor\u001b[39;00m a \u001b[38;5;129;01min\u001b[39;00m arrays\n\u001b[0;32m 2813\u001b[0m )\n\u001b[0;32m 2814\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:1843\u001b[0m, in \u001b[0;36mBaseShuffleSplit.split\u001b[1;34m(self, X, y, groups)\u001b[0m\n\u001b[0;32m 1813\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Generate indices to split data into training and test set.\u001b[39;00m\n\u001b[0;32m 1814\u001b[0m \n\u001b[0;32m 1815\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1840\u001b[0m \u001b[38;5;124;03mto an integer.\u001b[39;00m\n\u001b[0;32m 1841\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 1842\u001b[0m X, y, groups \u001b[38;5;241m=\u001b[39m indexable(X, y, groups)\n\u001b[1;32m-> 1843\u001b[0m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_iter_indices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgroups\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[0;32m 1844\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01myield\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\1\\Desktop\\улгту\\3 курс\\МИИ\\mai\\.venv\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:2252\u001b[0m, in \u001b[0;36mStratifiedShuffleSplit._iter_indices\u001b[1;34m(self, X, y, groups)\u001b[0m\n\u001b[0;32m 2250\u001b[0m class_counts \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mbincount(y_indices)\n\u001b[0;32m 2251\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39mmin(class_counts) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m2\u001b[39m:\n\u001b[1;32m-> 2252\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 2253\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe least populated class in y has only 1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2254\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m member, which is too few. The minimum\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2255\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m number of groups for any class cannot\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2256\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m be less than 2.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2257\u001b[0m )\n\u001b[0;32m 2259\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_train \u001b[38;5;241m<\u001b[39m n_classes:\n\u001b[0;32m 2260\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 2261\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe train_size = \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m should be greater or \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 2262\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mequal to the number of classes = \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m (n_train, n_classes)\n\u001b[0;32m 2263\u001b[0m )\n",
"\u001b[1;31mValueError\u001b[0m: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2."
]
2024-09-21 08:59:30 +04:00
}
],
"source": [
"# Show how many observations fall under each raw MedAge value.\n",
"print(df.MedAge.value_counts())\n",
"\n",
"# Stratifying on raw MedAge fails: some values occur only once, while a\n",
"# stratified split needs at least 2 rows per class (train_test_split raises\n",
"# ValueError otherwise). MedAge is therefore parsed as a number and replaced\n",
"# by its quartile group code (0 = youngest ... 3 = oldest), which keeps every\n",
"# class well populated in both nested splits.\n",
"med_age = pd.to_numeric(df[\"MedAge\"], errors=\"coerce\")  # unparseable -> NaN\n",
"\n",
"data = (\n",
"    df[[\"MedAge\", \"Fert. Rate\", \"Density(P/Km²)\"]]\n",
"    .assign(MedAge=pd.qcut(med_age, q=4, labels=False, duplicates=\"drop\"))\n",
"    .dropna(subset=[\"MedAge\"])  # rows whose median age could not be parsed\n",
"    .astype({\"MedAge\": int})\n",
")\n",
"\n",
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
"    data,\n",
"    stratify_colname=\"MedAge\",\n",
"    frac_train=0.60,\n",
"    frac_val=0.20,\n",
"    frac_test=0.20,\n",
")\n",
"\n",
"print(\"Обучающая выборка: \", df_train.shape)\n",
"print(df_train.MedAge.value_counts())\n",
"\n",
"print(\"Контрольная выборка: \", df_val.shape)\n",
"print(df_val.MedAge.value_counts())\n",
"\n",
"print(\"Тестовая выборка: \", df_test.shape)\n",
"print(df_test.MedAge.value_counts())"
2024-09-21 08:59:30 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выборка с избытком (oversampling)\n",
"\n",
"https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
"\n",
"https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
"\n",
"Выборка с недостатком (undersampling)\n",
"\n",
"https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
"\n",
"Библиотека imbalanced-learn\n",
"\n",
"https://imbalanced-learn.org/stable/"
]
},
{
"cell_type": "code",
2024-10-24 21:50:12 +04:00
"execution_count": 1,
2024-09-21 08:59:30 +04:00
"metadata": {},
"outputs": [
{
2024-10-24 21:50:12 +04:00
"ename": "ModuleNotFoundError",
"evalue": "No module named 'imblearn'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[1], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mimblearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mover_sampling\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ADASYN\n\u001b[0;32m 3\u001b[0m ada \u001b[38;5;241m=\u001b[39m ADASYN()\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, df_train\u001b[38;5;241m.\u001b[39mshape)\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'imblearn'"
2024-09-21 08:59:30 +04:00
]
}
],
"source": [
"from imblearn.over_sampling import ADASYN\n",
"\n",
"# Class label to balance: the column the train/val/test split is stratified\n",
"# on. (The training frame has no \"Pclass\" column.)\n",
"label_col = \"MedAge\"\n",
"\n",
"# ADASYN only accepts numeric input: coerce every column to a number and\n",
"# drop the rows that contain values which cannot be parsed.\n",
"train_numeric = df_train.apply(pd.to_numeric, errors=\"coerce\").dropna()\n",
"\n",
"# A fixed seed keeps the synthetic samples reproducible between runs.\n",
"ada = ADASYN(random_state=42)\n",
"\n",
"print(\"Обучающая выборка: \", train_numeric.shape)\n",
"print(train_numeric[label_col].value_counts())\n",
"\n",
"# The label column is deliberately left inside X so that the resampled frame\n",
"# still carries it.\n",
"X_resampled, y_resampled = ada.fit_resample(train_numeric, train_numeric[label_col])\n",
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
"\n",
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
"print(df_train_adasyn[label_col].value_counts())\n",
"\n",
"df_train_adasyn"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2024-10-24 21:50:12 +04:00
"version": "3.12.5"
2024-09-21 08:59:30 +04:00
}
},
"nbformat": 4,
"nbformat_minor": 2
}