960 lines
31 KiB
Plaintext
960 lines
31 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Загрузка данных в DataFrame"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 891 entries, 1 to 891\n",
|
||
"Data columns (total 11 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 Survived 891 non-null int64 \n",
|
||
" 1 Pclass 891 non-null int64 \n",
|
||
" 2 Name 891 non-null object \n",
|
||
" 3 Sex 891 non-null object \n",
|
||
" 4 Age 714 non-null float64\n",
|
||
" 5 SibSp 891 non-null int64 \n",
|
||
" 6 Parch 891 non-null int64 \n",
|
||
" 7 Ticket 891 non-null object \n",
|
||
" 8 Fare 891 non-null float64\n",
|
||
" 9 Cabin 204 non-null object \n",
|
||
" 10 Embarked 889 non-null object \n",
|
||
"dtypes: float64(2), int64(4), object(5)\n",
|
||
"memory usage: 83.5+ KB\n",
|
||
"(891, 11)\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Survived</th>\n",
|
||
" <th>Pclass</th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Sex</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>SibSp</th>\n",
|
||
" <th>Parch</th>\n",
|
||
" <th>Ticket</th>\n",
|
||
" <th>Fare</th>\n",
|
||
" <th>Cabin</th>\n",
|
||
" <th>Embarked</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>PassengerId</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Braund, Mr. Owen Harris</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>A/5 21171</td>\n",
|
||
" <td>7.2500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>PC 17599</td>\n",
|
||
" <td>71.2833</td>\n",
|
||
" <td>C85</td>\n",
|
||
" <td>C</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Heikkinen, Miss. Laina</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>STON/O2. 3101282</td>\n",
|
||
" <td>7.9250</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>113803</td>\n",
|
||
" <td>53.1000</td>\n",
|
||
" <td>C123</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Allen, Mr. William Henry</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>373450</td>\n",
|
||
" <td>8.0500</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Survived Pclass \\\n",
|
||
"PassengerId \n",
|
||
"1 0 3 \n",
|
||
"2 1 1 \n",
|
||
"3 1 3 \n",
|
||
"4 1 1 \n",
|
||
"5 0 3 \n",
|
||
"\n",
|
||
" Name Sex Age \\\n",
|
||
"PassengerId \n",
|
||
"1 Braund, Mr. Owen Harris male 22.0 \n",
|
||
"2 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 \n",
|
||
"3 Heikkinen, Miss. Laina female 26.0 \n",
|
||
"4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 \n",
|
||
"5 Allen, Mr. William Henry male 35.0 \n",
|
||
"\n",
|
||
" SibSp Parch Ticket Fare Cabin Embarked \n",
|
||
"PassengerId \n",
|
||
"1 1 0 A/5 21171 7.2500 NaN S \n",
|
||
"2 1 0 PC 17599 71.2833 C85 C \n",
|
||
"3 0 0 STON/O2. 3101282 7.9250 NaN S \n",
|
||
"4 1 0 113803 53.1000 C123 S \n",
|
||
"5 0 0 373450 8.0500 NaN S "
|
||
]
|
||
},
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
"# Load the Titanic passenger table; PassengerId becomes the row index.\n",
"df = pd.read_csv(\"data/titanic.csv\", index_col=\"PassengerId\")\n",
"\n",
"# Per-column dtypes and non-null counts, followed by the (rows, columns) shape.\n",
"df.info()\n",
"print(df.shape)\n",
"\n",
"# Preview the first rows as the cell's rich output.\n",
"df.head()"
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Получение сведений о пропущенных данных"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Типы пропущенных данных:\n",
|
||
"- None - представление пустых данных в Python\n",
|
||
"- NaN - представление пустых данных в Pandas\n",
|
||
"- '' - пустая строка"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Survived 0\n",
|
||
"Pclass 0\n",
|
||
"Name 0\n",
|
||
"Sex 0\n",
|
||
"Age 177\n",
|
||
"SibSp 0\n",
|
||
"Parch 0\n",
|
||
"Ticket 0\n",
|
||
"Fare 0\n",
|
||
"Cabin 687\n",
|
||
"Embarked 2\n",
|
||
"dtype: int64\n",
|
||
"\n",
|
||
"Survived False\n",
|
||
"Pclass False\n",
|
||
"Name False\n",
|
||
"Sex False\n",
|
||
"Age True\n",
|
||
"SibSp False\n",
|
||
"Parch False\n",
|
||
"Ticket False\n",
|
||
"Fare False\n",
|
||
"Cabin True\n",
|
||
"Embarked True\n",
|
||
"dtype: bool\n",
|
||
"\n",
|
||
"Age процент пустых значений: %19.87\n",
|
||
"Cabin процент пустых значений: %77.10\n",
|
||
"Embarked процент пустых значений: %0.22\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"# Number of missing values in each column\n",
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Whether each column contains at least one missing value\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Percentage of missing values, reported only for columns that have any.\n",
"# The mean of the boolean mask equals (missing count / row count).\n",
"null_rates = df.isnull().mean() * 100\n",
"for column, null_rate in null_rates[null_rates > 0].items():\n",
"    # The percent sign goes after the number (\"19.87%\"), not before it.\n",
"    print(f\"{column} процент пустых значений: {null_rate:.2f}%\")"
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Заполнение пропущенных данных\n",
|
||
"\n",
|
||
"https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
|
||
"\n",
|
||
"https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"(891, 11)\n",
|
||
"Survived False\n",
|
||
"Pclass False\n",
|
||
"Name False\n",
|
||
"Sex False\n",
|
||
"Age False\n",
|
||
"SibSp False\n",
|
||
"Parch False\n",
|
||
"Ticket False\n",
|
||
"Fare False\n",
|
||
"Cabin False\n",
|
||
"Embarked False\n",
|
||
"dtype: bool\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Survived</th>\n",
|
||
" <th>Pclass</th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Sex</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>SibSp</th>\n",
|
||
" <th>Parch</th>\n",
|
||
" <th>Ticket</th>\n",
|
||
" <th>Fare</th>\n",
|
||
" <th>Cabin</th>\n",
|
||
" <th>Embarked</th>\n",
|
||
" <th>AgeFillNA</th>\n",
|
||
" <th>AgeFillMedian</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>PassengerId</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>887</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>Montvila, Rev. Juozas</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>211536</td>\n",
|
||
" <td>13.00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>27.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>888</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Graham, Miss. Margaret Edith</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>19.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>112053</td>\n",
|
||
" <td>30.00</td>\n",
|
||
" <td>B42</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>19.0</td>\n",
|
||
" <td>19.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>889</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>W./C. 6607</td>\n",
|
||
" <td>23.45</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>28.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>890</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Behr, Mr. Karl Howell</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>111369</td>\n",
|
||
" <td>30.00</td>\n",
|
||
" <td>C148</td>\n",
|
||
" <td>C</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>891</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Dooley, Mr. Patrick</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>32.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>370376</td>\n",
|
||
" <td>7.75</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Q</td>\n",
|
||
" <td>32.0</td>\n",
|
||
" <td>32.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Survived Pclass Name \\\n",
|
||
"PassengerId \n",
|
||
"887 0 2 Montvila, Rev. Juozas \n",
|
||
"888 1 1 Graham, Miss. Margaret Edith \n",
|
||
"889 0 3 Johnston, Miss. Catherine Helen \"Carrie\" \n",
|
||
"890 1 1 Behr, Mr. Karl Howell \n",
|
||
"891 0 3 Dooley, Mr. Patrick \n",
|
||
"\n",
|
||
" Sex Age SibSp Parch Ticket Fare Cabin Embarked \\\n",
|
||
"PassengerId \n",
|
||
"887 male 27.0 0 0 211536 13.00 NaN S \n",
|
||
"888 female 19.0 0 0 112053 30.00 B42 S \n",
|
||
"889 female NaN 1 2 W./C. 6607 23.45 NaN S \n",
|
||
"890 male 26.0 0 0 111369 30.00 C148 C \n",
|
||
"891 male 32.0 0 0 370376 7.75 NaN Q \n",
|
||
"\n",
|
||
" AgeFillNA AgeFillMedian \n",
|
||
"PassengerId \n",
|
||
"887 27.0 27.0 \n",
|
||
"888 19.0 19.0 \n",
|
||
"889 0.0 28.0 \n",
|
||
"890 26.0 26.0 \n",
|
||
"891 32.0 32.0 "
|
||
]
|
||
},
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
"# Copy of the frame in which every missing value, in every column, becomes 0\n",
"fillna_df = df.fillna(0)\n",
"print(fillna_df.shape)\n",
"print(fillna_df.isnull().any())\n",
"\n",
"# Missing ages replaced by 0\n",
"df[\"AgeFillNA\"] = df[\"Age\"].fillna(value=0)\n",
"\n",
"# Missing ages replaced by the median of the known ages\n",
"df[\"AgeFillMedian\"] = df[\"Age\"].fillna(value=df[\"Age\"].median())\n",
"\n",
"# The last rows include a passenger with an unknown age, so both fills are visible\n",
"df.tail()"
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Survived</th>\n",
|
||
" <th>Pclass</th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Sex</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>SibSp</th>\n",
|
||
" <th>Parch</th>\n",
|
||
" <th>Ticket</th>\n",
|
||
" <th>Fare</th>\n",
|
||
" <th>Cabin</th>\n",
|
||
" <th>Embarked</th>\n",
|
||
" <th>AgeFillNA</th>\n",
|
||
" <th>AgeFillMedian</th>\n",
|
||
" <th>AgeCopy</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>PassengerId</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>887</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>Montvila, Rev. Juozas</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>211536</td>\n",
|
||
" <td>13.00</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>27.0</td>\n",
|
||
" <td>27.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>888</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Graham, Miss. Margaret Edith</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>19.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>112053</td>\n",
|
||
" <td>30.00</td>\n",
|
||
" <td>B42</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>19.0</td>\n",
|
||
" <td>19.0</td>\n",
|
||
" <td>19.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>889</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>W./C. 6607</td>\n",
|
||
" <td>23.45</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>28.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>890</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Behr, Mr. Karl Howell</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>111369</td>\n",
|
||
" <td>30.00</td>\n",
|
||
" <td>C148</td>\n",
|
||
" <td>C</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>891</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Dooley, Mr. Patrick</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>32.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>370376</td>\n",
|
||
" <td>7.75</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Q</td>\n",
|
||
" <td>32.0</td>\n",
|
||
" <td>32.0</td>\n",
|
||
" <td>32.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Survived Pclass Name \\\n",
|
||
"PassengerId \n",
|
||
"887 0 2 Montvila, Rev. Juozas \n",
|
||
"888 1 1 Graham, Miss. Margaret Edith \n",
|
||
"889 0 3 Johnston, Miss. Catherine Helen \"Carrie\" \n",
|
||
"890 1 1 Behr, Mr. Karl Howell \n",
|
||
"891 0 3 Dooley, Mr. Patrick \n",
|
||
"\n",
|
||
" Sex Age SibSp Parch Ticket Fare Cabin Embarked \\\n",
|
||
"PassengerId \n",
|
||
"887 male 27.0 0 0 211536 13.00 NaN S \n",
|
||
"888 female 19.0 0 0 112053 30.00 B42 S \n",
|
||
"889 female NaN 1 2 W./C. 6607 23.45 NaN S \n",
|
||
"890 male 26.0 0 0 111369 30.00 C148 C \n",
|
||
"891 male 32.0 0 0 370376 7.75 NaN Q \n",
|
||
"\n",
|
||
" AgeFillNA AgeFillMedian AgeCopy \n",
|
||
"PassengerId \n",
|
||
"887 27.0 27.0 27.0 \n",
|
||
"888 19.0 19.0 19.0 \n",
|
||
"889 0.0 28.0 0.0 \n",
|
||
"890 26.0 26.0 26.0 \n",
|
||
"891 32.0 32.0 32.0 "
|
||
]
|
||
},
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
"# Duplicate the Age column so the original values stay available for comparison\n",
"df[\"AgeCopy\"] = df[\"Age\"]\n",
"\n",
"# Fill the gaps directly inside df (inplace=True) instead of building a new frame;\n",
"# the dict restricts the replacement to the AgeCopy column.\n",
"df.fillna(value={\"AgeCopy\": 0}, inplace=True)\n",
"\n",
"df.tail()"
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Удаление наблюдений с пропусками"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"(183, 14)\n",
|
||
"Survived False\n",
|
||
"Pclass False\n",
|
||
"Name False\n",
|
||
"Sex False\n",
|
||
"Age False\n",
|
||
"SibSp False\n",
|
||
"Parch False\n",
|
||
"Ticket False\n",
|
||
"Fare False\n",
|
||
"Cabin False\n",
|
||
"Embarked False\n",
|
||
"dtype: bool\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"# Keep only the rows that have no missing value in any column\n",
"dropna_df = df.dropna()\n",
"\n",
"print(dropna_df.shape)\n",
"\n",
"# Check the result of dropna() itself. This line used to inspect fillna_df,\n",
"# a frame built in an earlier cell, so it said nothing about dropna_df.\n",
"print(dropna_df.isnull().any())"
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Создание выборок данных\n",
|
||
"\n",
|
||
"Библиотека scikit-learn\n",
|
||
"\n",
|
||
"https://scikit-learn.org/stable/index.html"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"<img src=\"assets/lec2-split.png\" width=\"600\" style=\"background-color: white\">"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
"# Helper that builds the train / validation / test subsets\n",
"import math\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"\n",
"def split_stratified_into_train_val_test(\n",
"    df_input,\n",
"    stratify_colname=\"y\",\n",
"    frac_train=0.6,\n",
"    frac_val=0.15,\n",
"    frac_test=0.25,\n",
"    random_state=None,\n",
"):\n",
"    \"\"\"\n",
"    Split a Pandas dataframe into stratified train, val and test subsets.\n",
"\n",
"    The split follows the fractional ratios provided by the user, and each\n",
"    subset is stratified by the values in one column (that is, each subset\n",
"    keeps approximately the same relative frequency of the values in that\n",
"    column). It is performed by running train_test_split() twice: first\n",
"    train against the rest, then the rest into val against test.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df_input : Pandas dataframe\n",
"        Input dataframe to be split.\n",
"    stratify_colname : str\n",
"        The name of the column that will be used for stratification. Usually\n",
"        this column would be for the label.\n",
"    frac_train, frac_val, frac_test : float\n",
"        The ratios with which the dataframe will be split into train, val, and\n",
"        test data. The values should be expressed as float fractions and should\n",
"        sum to 1.0 (checked with a floating-point tolerance).\n",
"    random_state : int, None, or RandomState instance\n",
"        Value passed to both train_test_split() calls.\n",
"\n",
"    Returns\n",
"    -------\n",
"    df_train, df_val, df_test :\n",
"        Dataframes containing the three splits.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If the fractions do not add up to 1.0, or if stratify_colname is not\n",
"        a column of df_input.\n",
"    \"\"\"\n",
"\n",
"    # Compare with a tolerance: an exact `!= 1.0` test rejects valid input such\n",
"    # as 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999 in floating point.\n",
"    if not math.isclose(frac_train + frac_val + frac_test, 1.0):\n",
"        raise ValueError(\n",
"            \"fractions %f, %f, %f do not add up to 1.0\"\n",
"            % (frac_train, frac_val, frac_test)\n",
"        )\n",
"\n",
"    if stratify_colname not in df_input.columns:\n",
"        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"\n",
"    X = df_input  # Contains all columns.\n",
"    y = df_input[[stratify_colname]]  # Dataframe of just the stratification column.\n",
"\n",
"    # Split original dataframe into train and temp dataframes.\n",
"    df_train, df_temp, _, y_temp = train_test_split(\n",
"        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
"    )\n",
"\n",
"    # Split the temp dataframe into val and test dataframes. The test share is\n",
"    # re-expressed relative to the temp frame, which holds only val + test rows.\n",
"    relative_frac_test = frac_test / (frac_val + frac_test)\n",
"    df_val, df_test, _, _ = train_test_split(\n",
"        df_temp,\n",
"        y_temp,\n",
"        stratify=y_temp,\n",
"        test_size=relative_frac_test,\n",
"        random_state=random_state,\n",
"    )\n",
"\n",
"    # Sanity check: no row was lost or duplicated by the two-step split.\n",
"    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n",
"    return df_train, df_val, df_test"
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Pclass\n",
|
||
"3 491\n",
|
||
"1 216\n",
|
||
"2 184\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Обучающая выборка: (534, 3)\n",
|
||
"Pclass\n",
|
||
"3 294\n",
|
||
"1 130\n",
|
||
"2 110\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Контрольная выборка: (178, 3)\n",
|
||
"Pclass\n",
|
||
"3 98\n",
|
||
"1 43\n",
|
||
"2 37\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Тестовая выборка: (179, 3)\n",
|
||
"Pclass\n",
|
||
"3 99\n",
|
||
"1 43\n",
|
||
"2 37\n",
|
||
"Name: count, dtype: int64\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"# Distribution of observations across the labels (passenger classes)\n",
"print(df.Pclass.value_counts())\n",
"\n",
"data = df[[\"Pclass\", \"Survived\", \"AgeFillMedian\"]].copy()\n",
"\n",
"# A fixed seed makes the split identical on every Restart & Run All;\n",
"# with the default random_state=None the rows are reshuffled on each run.\n",
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
"    data,\n",
"    stratify_colname=\"Pclass\",\n",
"    frac_train=0.60,\n",
"    frac_val=0.20,\n",
"    frac_test=0.20,\n",
"    random_state=42,\n",
")\n",
"\n",
"# Size and class distribution of each subset\n",
"for title, subset in (\n",
"    (\"Обучающая выборка: \", df_train),\n",
"    (\"Контрольная выборка: \", df_val),\n",
"    (\"Тестовая выборка: \", df_test),\n",
"):\n",
"    print(title, subset.shape)\n",
"    print(subset.Pclass.value_counts())"
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Выборка с избытком (oversampling)\n",
|
||
"\n",
|
||
"https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
|
||
"\n",
|
||
"https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
|
||
"\n",
|
||
"Выборка с недостатком (undersampling)\n",
|
||
"\n",
|
||
"https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
|
||
"\n",
|
||
"Библиотека imbalanced-learn\n",
|
||
"\n",
|
||
"https://imbalanced-learn.org/stable/"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"ename": "NameError",
|
||
"evalue": "name 'df_train' is not defined",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
|
||
"Cell \u001b[1;32mIn[2], line 5\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mimblearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mover_sampling\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ADASYN\n\u001b[0;32m 3\u001b[0m ada \u001b[38;5;241m=\u001b[39m ADASYN()\n\u001b[1;32m----> 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mОбучающая выборка: \u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[43mdf_train\u001b[49m\u001b[38;5;241m.\u001b[39mshape)\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28mprint\u001b[39m(df_train\u001b[38;5;241m.\u001b[39mPclass\u001b[38;5;241m.\u001b[39mvalue_counts())\n\u001b[0;32m 8\u001b[0m X_resampled, y_resampled \u001b[38;5;241m=\u001b[39m ada\u001b[38;5;241m.\u001b[39mfit_resample(df_train, df_train[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPclass\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n",
|
||
"\u001b[1;31mNameError\u001b[0m: name 'df_train' is not defined"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"from imblearn.over_sampling import ADASYN\n",
"\n",
"# Fixed seed so the synthetic samples are the same on every run\n",
"ada = ADASYN(random_state=42)\n",
"\n",
"print(\"Обучающая выборка: \", df_train.shape)\n",
"print(df_train.Pclass.value_counts())\n",
"\n",
"# Pclass is the target, so it is passed only as y and kept out of the feature\n",
"# matrix; otherwise the label itself takes part in the nearest-neighbour search.\n",
"X_resampled, y_resampled = ada.fit_resample(\n",
"    df_train.drop(columns=\"Pclass\"), df_train[\"Pclass\"]\n",
")\n",
"\n",
"# Re-attach the resampled labels to get a single training frame again\n",
"df_train_adasyn = X_resampled.assign(Pclass=y_resampled)\n",
"\n",
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
"print(df_train_adasyn.Pclass.value_counts())\n",
"\n",
"df_train_adasyn"
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.7"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|