{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Лабораторная работа №1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7. Основные возможности библиотеки Pandas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###\tЗагрузка и сохранение данных"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 6 | \n",
" 148 | \n",
" 72 | \n",
" 35 | \n",
" 0 | \n",
" 33.6 | \n",
" 0.627 | \n",
" 50 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 85 | \n",
" 66 | \n",
" 29 | \n",
" 0 | \n",
" 26.6 | \n",
" 0.351 | \n",
" 31 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 8 | \n",
" 183 | \n",
" 64 | \n",
" 0 | \n",
" 0 | \n",
" 23.3 | \n",
" 0.672 | \n",
" 32 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 89 | \n",
" 66 | \n",
" 23 | \n",
" 94 | \n",
" 28.1 | \n",
" 0.167 | \n",
" 21 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 137 | \n",
" 40 | \n",
" 35 | \n",
" 168 | \n",
" 43.1 | \n",
" 2.288 | \n",
" 33 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"0 6 148 72 35 0 33.6 \n",
"1 1 85 66 29 0 26.6 \n",
"2 8 183 64 0 0 23.3 \n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"0 0.627 50 1 \n",
"1 0.351 31 0 \n",
"2 0.672 32 1 \n",
"3 0.167 21 0 \n",
"4 2.288 33 1 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the diabetes dataset and preview its first five rows.\n",
"df = pd.read_csv(\"diabetes.csv\")\n",
"df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"new.csv\", index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Получение сведений о датафрейме с данными"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 3.845052 | \n",
" 120.894531 | \n",
" 69.105469 | \n",
" 20.536458 | \n",
" 79.799479 | \n",
" 31.992578 | \n",
" 0.471876 | \n",
" 33.240885 | \n",
" 0.348958 | \n",
"
\n",
" \n",
" std | \n",
" 3.369578 | \n",
" 31.972618 | \n",
" 19.355807 | \n",
" 15.952218 | \n",
" 115.244002 | \n",
" 7.884160 | \n",
" 0.331329 | \n",
" 11.760232 | \n",
" 0.476951 | \n",
"
\n",
" \n",
" min | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.078000 | \n",
" 21.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 1.000000 | \n",
" 99.000000 | \n",
" 62.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 27.300000 | \n",
" 0.243750 | \n",
" 24.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 3.000000 | \n",
" 117.000000 | \n",
" 72.000000 | \n",
" 23.000000 | \n",
" 30.500000 | \n",
" 32.000000 | \n",
" 0.372500 | \n",
" 29.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 6.000000 | \n",
" 140.250000 | \n",
" 80.000000 | \n",
" 32.000000 | \n",
" 127.250000 | \n",
" 36.600000 | \n",
" 0.626250 | \n",
" 41.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" max | \n",
" 17.000000 | \n",
" 199.000000 | \n",
" 122.000000 | \n",
" 99.000000 | \n",
" 846.000000 | \n",
" 67.100000 | \n",
" 2.420000 | \n",
" 81.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin \\\n",
"count 768.000000 768.000000 768.000000 768.000000 768.000000 \n",
"mean 3.845052 120.894531 69.105469 20.536458 79.799479 \n",
"std 3.369578 31.972618 19.355807 15.952218 115.244002 \n",
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"25% 1.000000 99.000000 62.000000 0.000000 0.000000 \n",
"50% 3.000000 117.000000 72.000000 23.000000 30.500000 \n",
"75% 6.000000 140.250000 80.000000 32.000000 127.250000 \n",
"max 17.000000 199.000000 122.000000 99.000000 846.000000 \n",
"\n",
" BMI DiabetesPedigreeFunction Age Outcome \n",
"count 768.000000 768.000000 768.000000 768.000000 \n",
"mean 31.992578 0.471876 33.240885 0.348958 \n",
"std 7.884160 0.331329 11.760232 0.476951 \n",
"min 0.000000 0.078000 21.000000 0.000000 \n",
"25% 27.300000 0.243750 24.000000 0.000000 \n",
"50% 32.000000 0.372500 29.000000 0.000000 \n",
"75% 36.600000 0.626250 41.000000 1.000000 \n",
"max 67.100000 2.420000 81.000000 1.000000 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 768 entries, 0 to 767\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Pregnancies 768 non-null int64 \n",
" 1 Glucose 768 non-null int64 \n",
" 2 BloodPressure 768 non-null int64 \n",
" 3 SkinThickness 768 non-null int64 \n",
" 4 Insulin 768 non-null int64 \n",
" 5 BMI 768 non-null float64\n",
" 6 DiabetesPedigreeFunction 768 non-null float64\n",
" 7 Age 768 non-null int64 \n",
" 8 Outcome 768 non-null int64 \n",
"dtypes: float64(2), int64(7)\n",
"memory usage: 54.1 KB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###\tПолучение сведений о колонках датафрейма"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',\n",
" 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],\n",
" dtype='object')"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Вывод отдельных строк и столбцов из датафрейма"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Age | \n",
" Insulin | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 50 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 31 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 32 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 21 | \n",
" 94 | \n",
"
\n",
" \n",
" 4 | \n",
" 33 | \n",
" 168 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 763 | \n",
" 63 | \n",
" 180 | \n",
"
\n",
" \n",
" 764 | \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" 765 | \n",
" 30 | \n",
" 112 | \n",
"
\n",
" \n",
" 766 | \n",
" 47 | \n",
" 0 | \n",
"
\n",
" \n",
" 767 | \n",
" 23 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
768 rows × 2 columns
\n",
"
"
],
"text/plain": [
" Age Insulin\n",
"0 50 0\n",
"1 31 0\n",
"2 32 0\n",
"3 21 94\n",
"4 33 168\n",
".. ... ...\n",
"763 63 180\n",
"764 27 0\n",
"765 30 112\n",
"766 47 0\n",
"767 23 0\n",
"\n",
"[768 rows x 2 columns]"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Select two columns by label; the result is still a DataFrame.\n",
"df.loc[:, [\"Age\", \"Insulin\"]]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 3 | \n",
" 1 | \n",
" 89 | \n",
" 66 | \n",
" 23 | \n",
" 94 | \n",
" 28.1 | \n",
" 0.167 | \n",
" 21 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 137 | \n",
" 40 | \n",
" 35 | \n",
" 168 | \n",
" 43.1 | \n",
" 2.288 | \n",
" 33 | \n",
" 1 | \n",
"
\n",
" \n",
" 5 | \n",
" 5 | \n",
" 116 | \n",
" 74 | \n",
" 0 | \n",
" 0 | \n",
" 25.6 | \n",
" 0.201 | \n",
" 30 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
"5 5 116 74 0 0 25.6 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"3 0.167 21 0 \n",
"4 2.288 33 1 \n",
"5 0.201 30 0 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows at positions 3, 4 and 5 (the end of an iloc slice is exclusive).\n",
"df.iloc[3:6, :]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 4 | \n",
" 0 | \n",
" 137 | \n",
" 40 | \n",
" 35 | \n",
" 168 | \n",
" 43.1 | \n",
" 2.288 | \n",
" 33 | \n",
" 1 | \n",
"
\n",
" \n",
" 8 | \n",
" 2 | \n",
" 197 | \n",
" 70 | \n",
" 45 | \n",
" 543 | \n",
" 30.5 | \n",
" 0.158 | \n",
" 53 | \n",
" 1 | \n",
"
\n",
" \n",
" 13 | \n",
" 1 | \n",
" 189 | \n",
" 60 | \n",
" 23 | \n",
" 846 | \n",
" 30.1 | \n",
" 0.398 | \n",
" 59 | \n",
" 1 | \n",
"
\n",
" \n",
" 14 | \n",
" 5 | \n",
" 166 | \n",
" 72 | \n",
" 19 | \n",
" 175 | \n",
" 25.8 | \n",
" 0.587 | \n",
" 51 | \n",
" 1 | \n",
"
\n",
" \n",
" 16 | \n",
" 0 | \n",
" 118 | \n",
" 84 | \n",
" 47 | \n",
" 230 | \n",
" 45.8 | \n",
" 0.551 | \n",
" 31 | \n",
" 1 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 748 | \n",
" 3 | \n",
" 187 | \n",
" 70 | \n",
" 22 | \n",
" 200 | \n",
" 36.4 | \n",
" 0.408 | \n",
" 36 | \n",
" 1 | \n",
"
\n",
" \n",
" 753 | \n",
" 0 | \n",
" 181 | \n",
" 88 | \n",
" 44 | \n",
" 510 | \n",
" 43.3 | \n",
" 0.222 | \n",
" 26 | \n",
" 1 | \n",
"
\n",
" \n",
" 755 | \n",
" 1 | \n",
" 128 | \n",
" 88 | \n",
" 39 | \n",
" 110 | \n",
" 36.5 | \n",
" 1.057 | \n",
" 37 | \n",
" 1 | \n",
"
\n",
" \n",
" 763 | \n",
" 10 | \n",
" 101 | \n",
" 76 | \n",
" 48 | \n",
" 180 | \n",
" 32.9 | \n",
" 0.171 | \n",
" 63 | \n",
" 0 | \n",
"
\n",
" \n",
" 765 | \n",
" 5 | \n",
" 121 | \n",
" 72 | \n",
" 23 | \n",
" 112 | \n",
" 26.2 | \n",
" 0.245 | \n",
" 30 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
243 rows × 9 columns
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"4 0 137 40 35 168 43.1 \n",
"8 2 197 70 45 543 30.5 \n",
"13 1 189 60 23 846 30.1 \n",
"14 5 166 72 19 175 25.8 \n",
"16 0 118 84 47 230 45.8 \n",
".. ... ... ... ... ... ... \n",
"748 3 187 70 22 200 36.4 \n",
"753 0 181 88 44 510 43.3 \n",
"755 1 128 88 39 110 36.5 \n",
"763 10 101 76 48 180 32.9 \n",
"765 5 121 72 23 112 26.2 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"4 2.288 33 1 \n",
"8 0.158 53 1 \n",
"13 0.398 59 1 \n",
"14 0.587 51 1 \n",
"16 0.551 31 1 \n",
".. ... ... ... \n",
"748 0.408 36 1 \n",
"753 0.222 26 1 \n",
"755 1.057 37 1 \n",
"763 0.171 63 0 \n",
"765 0.245 30 0 \n",
"\n",
"[243 rows x 9 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only the rows whose Insulin value is strictly greater than 100.\n",
"df.loc[df['Insulin'].gt(100)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###\tГруппировка и агрегация данных в датафрейме"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Insulin | \n",
"
\n",
" \n",
" Pregnancies | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 81.675676 | \n",
"
\n",
" \n",
" 1 | \n",
" 98.674074 | \n",
"
\n",
" \n",
" 2 | \n",
" 85.844660 | \n",
"
\n",
" \n",
" 3 | \n",
" 87.453333 | \n",
"
\n",
" \n",
" 4 | \n",
" 69.441176 | \n",
"
\n",
" \n",
" 5 | \n",
" 57.298246 | \n",
"
\n",
" \n",
" 6 | \n",
" 63.580000 | \n",
"
\n",
" \n",
" 7 | \n",
" 84.466667 | \n",
"
\n",
" \n",
" 8 | \n",
" 92.815789 | \n",
"
\n",
" \n",
" 9 | \n",
" 62.428571 | \n",
"
\n",
" \n",
" 10 | \n",
" 34.791667 | \n",
"
\n",
" \n",
" 11 | \n",
" 65.454545 | \n",
"
\n",
" \n",
" 12 | \n",
" 112.555556 | \n",
"
\n",
" \n",
" 13 | \n",
" 27.900000 | \n",
"
\n",
" \n",
" 14 | \n",
" 92.000000 | \n",
"
\n",
" \n",
" 15 | \n",
" 110.000000 | \n",
"
\n",
" \n",
" 17 | \n",
" 114.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Insulin\n",
"Pregnancies \n",
"0 81.675676\n",
"1 98.674074\n",
"2 85.844660\n",
"3 87.453333\n",
"4 69.441176\n",
"5 57.298246\n",
"6 63.580000\n",
"7 84.466667\n",
"8 92.815789\n",
"9 62.428571\n",
"10 34.791667\n",
"11 65.454545\n",
"12 112.555556\n",
"13 27.900000\n",
"14 92.000000\n",
"15 110.000000\n",
"17 114.000000"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Mean Insulin for each number of pregnancies, shown as a one-column frame.\n",
"group = df.groupby(['Pregnancies'])['Insulin'].agg('mean')\n",
"pd.DataFrame(group)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Сортировка данных в датафрейме"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 661 | \n",
" 1 | \n",
" 199 | \n",
" 76 | \n",
" 43 | \n",
" 0 | \n",
" 42.9 | \n",
" 1.394 | \n",
" 22 | \n",
" 1 | \n",
"
\n",
" \n",
" 561 | \n",
" 0 | \n",
" 198 | \n",
" 66 | \n",
" 32 | \n",
" 274 | \n",
" 41.3 | \n",
" 0.502 | \n",
" 28 | \n",
" 1 | \n",
"
\n",
" \n",
" 228 | \n",
" 4 | \n",
" 197 | \n",
" 70 | \n",
" 39 | \n",
" 744 | \n",
" 36.7 | \n",
" 2.329 | \n",
" 31 | \n",
" 0 | \n",
"
\n",
" \n",
" 8 | \n",
" 2 | \n",
" 197 | \n",
" 70 | \n",
" 45 | \n",
" 543 | \n",
" 30.5 | \n",
" 0.158 | \n",
" 53 | \n",
" 1 | \n",
"
\n",
" \n",
" 579 | \n",
" 2 | \n",
" 197 | \n",
" 70 | \n",
" 99 | \n",
" 0 | \n",
" 34.7 | \n",
" 0.575 | \n",
" 62 | \n",
" 1 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 342 | \n",
" 1 | \n",
" 0 | \n",
" 68 | \n",
" 35 | \n",
" 0 | \n",
" 32.0 | \n",
" 0.389 | \n",
" 22 | \n",
" 0 | \n",
"
\n",
" \n",
" 349 | \n",
" 5 | \n",
" 0 | \n",
" 80 | \n",
" 32 | \n",
" 0 | \n",
" 41.0 | \n",
" 0.346 | \n",
" 37 | \n",
" 1 | \n",
"
\n",
" \n",
" 502 | \n",
" 6 | \n",
" 0 | \n",
" 68 | \n",
" 41 | \n",
" 0 | \n",
" 39.0 | \n",
" 0.727 | \n",
" 41 | \n",
" 1 | \n",
"
\n",
" \n",
" 182 | \n",
" 1 | \n",
" 0 | \n",
" 74 | \n",
" 20 | \n",
" 23 | \n",
" 27.7 | \n",
" 0.299 | \n",
" 21 | \n",
" 0 | \n",
"
\n",
" \n",
" 75 | \n",
" 1 | \n",
" 0 | \n",
" 48 | \n",
" 20 | \n",
" 0 | \n",
" 24.7 | \n",
" 0.140 | \n",
" 22 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
768 rows × 9 columns
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"661 1 199 76 43 0 42.9 \n",
"561 0 198 66 32 274 41.3 \n",
"228 4 197 70 39 744 36.7 \n",
"8 2 197 70 45 543 30.5 \n",
"579 2 197 70 99 0 34.7 \n",
".. ... ... ... ... ... ... \n",
"342 1 0 68 35 0 32.0 \n",
"349 5 0 80 32 0 41.0 \n",
"502 6 0 68 41 0 39.0 \n",
"182 1 0 74 20 23 27.7 \n",
"75 1 0 48 20 0 24.7 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"661 1.394 22 1 \n",
"561 0.502 28 1 \n",
"228 2.329 31 0 \n",
"8 0.158 53 1 \n",
"579 0.575 62 1 \n",
".. ... ... ... \n",
"342 0.389 22 0 \n",
"349 0.346 37 1 \n",
"502 0.727 41 1 \n",
"182 0.299 21 0 \n",
"75 0.140 22 0 \n",
"\n",
"[768 rows x 9 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Order the rows from the highest Glucose value to the lowest.\n",
"sorted_df = df.sort_values('Glucose', ascending=False)\n",
"sorted_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###\tУдаление строк/столбцов"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Drop the 'Insulin' and 'BMI' columns; df itself is left unchanged.\n",
"df_dropped_columns = df.drop(['Insulin', 'BMI'], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 6 | \n",
" 148 | \n",
" 72 | \n",
" 35 | \n",
" 0.627 | \n",
" 50 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 85 | \n",
" 66 | \n",
" 29 | \n",
" 0.351 | \n",
" 31 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 8 | \n",
" 183 | \n",
" 64 | \n",
" 0 | \n",
" 0.672 | \n",
" 32 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 89 | \n",
" 66 | \n",
" 23 | \n",
" 0.167 | \n",
" 21 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 137 | \n",
" 40 | \n",
" 35 | \n",
" 2.288 | \n",
" 33 | \n",
" 1 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 763 | \n",
" 10 | \n",
" 101 | \n",
" 76 | \n",
" 48 | \n",
" 0.171 | \n",
" 63 | \n",
" 0 | \n",
"
\n",
" \n",
" 764 | \n",
" 2 | \n",
" 122 | \n",
" 70 | \n",
" 27 | \n",
" 0.340 | \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" 765 | \n",
" 5 | \n",
" 121 | \n",
" 72 | \n",
" 23 | \n",
" 0.245 | \n",
" 30 | \n",
" 0 | \n",
"
\n",
" \n",
" 766 | \n",
" 1 | \n",
" 126 | \n",
" 60 | \n",
" 0 | \n",
" 0.349 | \n",
" 47 | \n",
" 1 | \n",
"
\n",
" \n",
" 767 | \n",
" 1 | \n",
" 93 | \n",
" 70 | \n",
" 31 | \n",
" 0.315 | \n",
" 23 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
768 rows × 7 columns
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness \\\n",
"0 6 148 72 35 \n",
"1 1 85 66 29 \n",
"2 8 183 64 0 \n",
"3 1 89 66 23 \n",
"4 0 137 40 35 \n",
".. ... ... ... ... \n",
"763 10 101 76 48 \n",
"764 2 122 70 27 \n",
"765 5 121 72 23 \n",
"766 1 126 60 0 \n",
"767 1 93 70 31 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"0 0.627 50 1 \n",
"1 0.351 31 0 \n",
"2 0.672 32 1 \n",
"3 0.167 21 0 \n",
"4 2.288 33 1 \n",
".. ... ... ... \n",
"763 0.171 63 0 \n",
"764 0.340 27 0 \n",
"765 0.245 30 0 \n",
"766 0.349 47 1 \n",
"767 0.315 23 0 \n",
"\n",
"[768 rows x 7 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_dropped_columns"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 2 | \n",
" 8 | \n",
" 183 | \n",
" 64 | \n",
" 0 | \n",
" 0 | \n",
" 23.3 | \n",
" 0.672 | \n",
" 32 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 89 | \n",
" 66 | \n",
" 23 | \n",
" 94 | \n",
" 28.1 | \n",
" 0.167 | \n",
" 21 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 137 | \n",
" 40 | \n",
" 35 | \n",
" 168 | \n",
" 43.1 | \n",
" 2.288 | \n",
" 33 | \n",
" 1 | \n",
"
\n",
" \n",
" 5 | \n",
" 5 | \n",
" 116 | \n",
" 74 | \n",
" 0 | \n",
" 0 | \n",
" 25.6 | \n",
" 0.201 | \n",
" 30 | \n",
" 0 | \n",
"
\n",
" \n",
" 6 | \n",
" 3 | \n",
" 78 | \n",
" 50 | \n",
" 32 | \n",
" 88 | \n",
" 31.0 | \n",
" 0.248 | \n",
" 26 | \n",
" 1 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 763 | \n",
" 10 | \n",
" 101 | \n",
" 76 | \n",
" 48 | \n",
" 180 | \n",
" 32.9 | \n",
" 0.171 | \n",
" 63 | \n",
" 0 | \n",
"
\n",
" \n",
" 764 | \n",
" 2 | \n",
" 122 | \n",
" 70 | \n",
" 27 | \n",
" 0 | \n",
" 36.8 | \n",
" 0.340 | \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" 765 | \n",
" 5 | \n",
" 121 | \n",
" 72 | \n",
" 23 | \n",
" 112 | \n",
" 26.2 | \n",
" 0.245 | \n",
" 30 | \n",
" 0 | \n",
"
\n",
" \n",
" 766 | \n",
" 1 | \n",
" 126 | \n",
" 60 | \n",
" 0 | \n",
" 0 | \n",
" 30.1 | \n",
" 0.349 | \n",
" 47 | \n",
" 1 | \n",
"
\n",
" \n",
" 767 | \n",
" 1 | \n",
" 93 | \n",
" 70 | \n",
" 31 | \n",
" 0 | \n",
" 30.4 | \n",
" 0.315 | \n",
" 23 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
766 rows × 9 columns
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"2 8 183 64 0 0 23.3 \n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
"5 5 116 74 0 0 25.6 \n",
"6 3 78 50 32 88 31.0 \n",
".. ... ... ... ... ... ... \n",
"763 10 101 76 48 180 32.9 \n",
"764 2 122 70 27 0 36.8 \n",
"765 5 121 72 23 112 26.2 \n",
"766 1 126 60 0 0 30.1 \n",
"767 1 93 70 31 0 30.4 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"2 0.672 32 1 \n",
"3 0.167 21 0 \n",
"4 2.288 33 1 \n",
"5 0.201 30 0 \n",
"6 0.248 26 1 \n",
".. ... ... ... \n",
"763 0.171 63 0 \n",
"764 0.340 27 0 \n",
"765 0.245 30 0 \n",
"766 0.349 47 1 \n",
"767 0.315 23 0 \n",
"\n",
"[766 rows x 9 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Drop the rows labelled 0 and 1; df itself is left unchanged.\n",
"df_dropped_rows = df.drop(index=[0, 1])\n",
"df_dropped_rows"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###\tСоздание новых столбцов на основе данных из существующих столбцов датафрейма"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Add a derived column: Glucose minus BloodPressure, computed row by row.\n",
"df['Glucose-BP'] = df['Glucose'].sub(df['BloodPressure'])"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
" Glucose-BP | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 6 | \n",
" 148 | \n",
" 72 | \n",
" 35 | \n",
" 0 | \n",
" 33.6 | \n",
" 0.627 | \n",
" 50 | \n",
" 1 | \n",
" 76 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 85 | \n",
" 66 | \n",
" 29 | \n",
" 0 | \n",
" 26.6 | \n",
" 0.351 | \n",
" 31 | \n",
" 0 | \n",
" 19 | \n",
"
\n",
" \n",
" 2 | \n",
" 8 | \n",
" 183 | \n",
" 64 | \n",
" 0 | \n",
" 0 | \n",
" 23.3 | \n",
" 0.672 | \n",
" 32 | \n",
" 1 | \n",
" 119 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 89 | \n",
" 66 | \n",
" 23 | \n",
" 94 | \n",
" 28.1 | \n",
" 0.167 | \n",
" 21 | \n",
" 0 | \n",
" 23 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 137 | \n",
" 40 | \n",
" 35 | \n",
" 168 | \n",
" 43.1 | \n",
" 2.288 | \n",
" 33 | \n",
" 1 | \n",
" 97 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 763 | \n",
" 10 | \n",
" 101 | \n",
" 76 | \n",
" 48 | \n",
" 180 | \n",
" 32.9 | \n",
" 0.171 | \n",
" 63 | \n",
" 0 | \n",
" 25 | \n",
"
\n",
" \n",
" 764 | \n",
" 2 | \n",
" 122 | \n",
" 70 | \n",
" 27 | \n",
" 0 | \n",
" 36.8 | \n",
" 0.340 | \n",
" 27 | \n",
" 0 | \n",
" 52 | \n",
"
\n",
" \n",
" 765 | \n",
" 5 | \n",
" 121 | \n",
" 72 | \n",
" 23 | \n",
" 112 | \n",
" 26.2 | \n",
" 0.245 | \n",
" 30 | \n",
" 0 | \n",
" 49 | \n",
"
\n",
" \n",
" 766 | \n",
" 1 | \n",
" 126 | \n",
" 60 | \n",
" 0 | \n",
" 0 | \n",
" 30.1 | \n",
" 0.349 | \n",
" 47 | \n",
" 1 | \n",
" 66 | \n",
"
\n",
" \n",
" 767 | \n",
" 1 | \n",
" 93 | \n",
" 70 | \n",
" 31 | \n",
" 0 | \n",
" 30.4 | \n",
" 0.315 | \n",
" 23 | \n",
" 0 | \n",
" 23 | \n",
"
\n",
" \n",
"
\n",
"
768 rows × 10 columns
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"0 6 148 72 35 0 33.6 \n",
"1 1 85 66 29 0 26.6 \n",
"2 8 183 64 0 0 23.3 \n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
".. ... ... ... ... ... ... \n",
"763 10 101 76 48 180 32.9 \n",
"764 2 122 70 27 0 36.8 \n",
"765 5 121 72 23 112 26.2 \n",
"766 1 126 60 0 0 30.1 \n",
"767 1 93 70 31 0 30.4 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome Glucose-BP \n",
"0 0.627 50 1 76 \n",
"1 0.351 31 0 19 \n",
"2 0.672 32 1 119 \n",
"3 0.167 21 0 23 \n",
"4 2.288 33 1 97 \n",
".. ... ... ... ... \n",
"763 0.171 63 0 25 \n",
"764 0.340 27 0 52 \n",
"765 0.245 30 0 49 \n",
"766 0.349 47 1 66 \n",
"767 0.315 23 0 23 \n",
"\n",
"[768 rows x 10 columns]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###\tУдаление строк с пустыми значениями"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pregnancies 0\n",
"Glucose 0\n",
"BloodPressure 0\n",
"SkinThickness 0\n",
"Insulin 0\n",
"BMI 0\n",
"DiabetesPedigreeFunction 0\n",
"Age 0\n",
"Outcome 0\n",
"Glucose-BP 0\n",
"dtype: int64\n"
]
}
],
"source": [
"# Number of missing values in each column.\n",
"missing_per_column = df.isna().sum()\n",
"print(missing_per_column)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
" Glucose-BP | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 6 | \n",
" 148 | \n",
" 72 | \n",
" 35 | \n",
" 0 | \n",
" 33.6 | \n",
" 0.627 | \n",
" 50 | \n",
" 1 | \n",
" 76 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 85 | \n",
" 66 | \n",
" 29 | \n",
" 0 | \n",
" 26.6 | \n",
" 0.351 | \n",
" 31 | \n",
" 0 | \n",
" 19 | \n",
"
\n",
" \n",
" 2 | \n",
" 8 | \n",
" 183 | \n",
" 64 | \n",
" 0 | \n",
" 0 | \n",
" 23.3 | \n",
" 0.672 | \n",
" 32 | \n",
" 1 | \n",
" 119 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 89 | \n",
" 66 | \n",
" 23 | \n",
" 94 | \n",
" 28.1 | \n",
" 0.167 | \n",
" 21 | \n",
" 0 | \n",
" 23 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 137 | \n",
" 40 | \n",
" 35 | \n",
" 168 | \n",
" 43.1 | \n",
" 2.288 | \n",
" 33 | \n",
" 1 | \n",
" 97 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 763 | \n",
" 10 | \n",
" 101 | \n",
" 76 | \n",
" 48 | \n",
" 180 | \n",
" 32.9 | \n",
" 0.171 | \n",
" 63 | \n",
" 0 | \n",
" 25 | \n",
"
\n",
" \n",
" 764 | \n",
" 2 | \n",
" 122 | \n",
" 70 | \n",
" 27 | \n",
" 0 | \n",
" 36.8 | \n",
" 0.340 | \n",
" 27 | \n",
" 0 | \n",
" 52 | \n",
"
\n",
" \n",
" 765 | \n",
" 5 | \n",
" 121 | \n",
" 72 | \n",
" 23 | \n",
" 112 | \n",
" 26.2 | \n",
" 0.245 | \n",
" 30 | \n",
" 0 | \n",
" 49 | \n",
"
\n",
" \n",
" 766 | \n",
" 1 | \n",
" 126 | \n",
" 60 | \n",
" 0 | \n",
" 0 | \n",
" 30.1 | \n",
" 0.349 | \n",
" 47 | \n",
" 1 | \n",
" 66 | \n",
"
\n",
" \n",
" 767 | \n",
" 1 | \n",
" 93 | \n",
" 70 | \n",
" 31 | \n",
" 0 | \n",
" 30.4 | \n",
" 0.315 | \n",
" 23 | \n",
" 0 | \n",
" 23 | \n",
"
\n",
" \n",
"
\n",
"
768 rows × 10 columns
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"0 6 148 72 35 0 33.6 \n",
"1 1 85 66 29 0 26.6 \n",
"2 8 183 64 0 0 23.3 \n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
".. ... ... ... ... ... ... \n",
"763 10 101 76 48 180 32.9 \n",
"764 2 122 70 27 0 36.8 \n",
"765 5 121 72 23 112 26.2 \n",
"766 1 126 60 0 0 30.1 \n",
"767 1 93 70 31 0 30.4 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome Glucose-BP \n",
"0 0.627 50 1 76 \n",
"1 0.351 31 0 19 \n",
"2 0.672 32 1 119 \n",
"3 0.167 21 0 23 \n",
"4 2.288 33 1 97 \n",
".. ... ... ... ... \n",
"763 0.171 63 0 25 \n",
"764 0.340 27 0 52 \n",
"765 0.245 30 0 49 \n",
"766 0.349 47 1 66 \n",
"767 0.315 23 0 23 \n",
"\n",
"[768 rows x 10 columns]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The frame has no missing values, so this returns every row unchanged.\n",
"df.dropna(axis=0, how='any')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###\tЗаполнение пустых значений на основе существующих данных"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Build each variant from the untouched df: filling in place with the mean\n",
"# first would leave no gaps for a following median fill to act on.\n",
"# numeric_only=True keeps the statistics defined if a text column is ever added.\n",
"df_filled_mean = df.fillna(df.mean(numeric_only=True))\n",
"df_filled_median = df.fillna(df.median(numeric_only=True))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Пропущенные значения обрабатываются для каждого столбца отдельно.\n",
"\n",
"Если столбец числовой, пропуски можно заполнить средним или медианой.\n",
"\n",
"Средним заполняем, если в столбце нет выбросов; при наличии выбросов лучше взять медиану, так как она к ним устойчива.\n",
"\n",
"Если столбец категориальный, пропуски можно заполнить модой (самым часто встречающимся значением).\n",
"\n",
"Если пропусков мало, строки с ними можно просто удалить."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 8. Возможности визуализации"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"