1918 lines
494 KiB
Plaintext
1918 lines
494 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
|
"source": "### 1) Информация о наборе данных о возрасте",
|
|||
|
"id": "b0d69d9cbffe7aa9"
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"id": "initial_id",
|
|||
|
"metadata": {
|
|||
|
"collapsed": true,
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:34.142613Z",
|
|||
|
"start_time": "2024-12-07T05:04:31.513373Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\"AgeDataset-V1.csv\")\n",
|
|||
|
"sampled_df = df.sample(frac=0.4)\n",
|
|||
|
"print(df.columns)\n",
|
|||
|
"print(df)"
|
|||
|
],
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['Id', 'Name', 'Short description', 'Gender', 'Country', 'Occupation',\n",
|
|||
|
" 'Birth year', 'Death year', 'Manner of death', 'Age of death'],\n",
|
|||
|
" dtype='object')\n",
|
|||
|
" Id Name \\\n",
|
|||
|
"0 Q23 George Washington \n",
|
|||
|
"1 Q42 Douglas Adams \n",
|
|||
|
"2 Q91 Abraham Lincoln \n",
|
|||
|
"3 Q254 Wolfgang Amadeus Mozart \n",
|
|||
|
"4 Q255 Ludwig van Beethoven \n",
|
|||
|
"... ... ... \n",
|
|||
|
"1223004 Q77247326 Marie-Fortunée Besson \n",
|
|||
|
"1223005 Q77249504 Ron Thorsen \n",
|
|||
|
"1223006 Q77249818 Diether Todenhagen \n",
|
|||
|
"1223007 Q77253909 Reginald Oswald Pearson \n",
|
|||
|
"1223008 Q77254864 Horst Lerche \n",
|
|||
|
"\n",
|
|||
|
" Short description Gender \\\n",
|
|||
|
"0 1st president of the United States (1732–1799) Male \n",
|
|||
|
"1 English writer and humorist Male \n",
|
|||
|
"2 16th president of the United States (1809-1865) Male \n",
|
|||
|
"3 Austrian composer of the Classical period Male \n",
|
|||
|
"4 German classical and romantic composer Male \n",
|
|||
|
"... ... ... \n",
|
|||
|
"1223004 Frans model (1907-1996) NaN \n",
|
|||
|
"1223005 xugador de baloncestu canadianu (1948–2004) NaN \n",
|
|||
|
"1223006 German navy officer and world war II U-boat co... NaN \n",
|
|||
|
"1223007 English artist, working in stained glass, prin... Male \n",
|
|||
|
"1223008 German painter Male \n",
|
|||
|
"\n",
|
|||
|
" Country \\\n",
|
|||
|
"0 United States of America; Kingdom of Great Bri... \n",
|
|||
|
"1 United Kingdom \n",
|
|||
|
"2 United States of America \n",
|
|||
|
"3 Archduchy of Austria; Archbishopric of Salzburg \n",
|
|||
|
"4 Holy Roman Empire; Austrian Empire \n",
|
|||
|
"... ... \n",
|
|||
|
"1223004 France \n",
|
|||
|
"1223005 Canada; United States of America \n",
|
|||
|
"1223006 Germany \n",
|
|||
|
"1223007 United Kingdom \n",
|
|||
|
"1223008 Germany \n",
|
|||
|
"\n",
|
|||
|
" Occupation Birth year Death year Manner of death \\\n",
|
|||
|
"0 Politician 1732 1799.0 natural causes \n",
|
|||
|
"1 Artist 1952 2001.0 natural causes \n",
|
|||
|
"2 Politician 1809 1865.0 homicide \n",
|
|||
|
"3 Artist 1756 1791.0 NaN \n",
|
|||
|
"4 Artist 1770 1827.0 NaN \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"1223004 Tailor; model 1907 1996.0 NaN \n",
|
|||
|
"1223005 Athlete 1948 2004.0 NaN \n",
|
|||
|
"1223006 Military personnel 1920 1944.0 NaN \n",
|
|||
|
"1223007 Artist 1887 1915.0 NaN \n",
|
|||
|
"1223008 Artist 1938 2017.0 NaN \n",
|
|||
|
"\n",
|
|||
|
" Age of death \n",
|
|||
|
"0 67.0 \n",
|
|||
|
"1 49.0 \n",
|
|||
|
"2 56.0 \n",
|
|||
|
"3 35.0 \n",
|
|||
|
"4 57.0 \n",
|
|||
|
"... ... \n",
|
|||
|
"1223004 89.0 \n",
|
|||
|
"1223005 56.0 \n",
|
|||
|
"1223006 24.0 \n",
|
|||
|
"1223007 28.0 \n",
|
|||
|
"1223008 79.0 \n",
|
|||
|
"\n",
|
|||
|
"[1223009 rows x 10 columns]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 2
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
|
"source": [
|
|||
|
"## Столбцы на русском:\n",
|
|||
|
"```Id``` - идентификатор записи (значения вида Q23, Q42)<br/>\n",
|
|||
|
"```Name``` - Полное имя<br/>\n",
|
|||
|
"```Short description``` - Краткое описание <br/>\n",
|
|||
|
"```Gender``` - пол<br/>\n",
|
|||
|
"```Country``` - Страна / исторический регион<br/>\n",
|
|||
|
"```Occupation``` - Название профессии<br/>\n",
|
|||
|
"```Birth year``` - Год рождения<br/>\n",
|
|||
|
"```Death year``` - Год смерти<br/>\n",
|
|||
|
"```Manner of death``` - Способ смерти<br/>\n",
|
|||
|
"```Age of death``` - Возраст смерти<br/>"
|
|||
|
],
|
|||
|
"id": "3670f952147eb566"
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:34.393967Z",
|
|||
|
"start_time": "2024-12-07T05:04:34.168224Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"df.info()\n",
|
|||
|
"df.head()"
|
|||
|
],
|
|||
|
"id": "ddb102a236b75928",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 1223009 entries, 0 to 1223008\n",
|
|||
|
"Data columns (total 10 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 Id 1223009 non-null object \n",
|
|||
|
" 1 Name 1223009 non-null object \n",
|
|||
|
" 2 Short description 1155109 non-null object \n",
|
|||
|
" 3 Gender 1089363 non-null object \n",
|
|||
|
" 4 Country 887500 non-null object \n",
|
|||
|
" 5 Occupation 1016095 non-null object \n",
|
|||
|
" 6 Birth year 1223009 non-null int64 \n",
|
|||
|
" 7 Death year 1223008 non-null float64\n",
|
|||
|
" 8 Manner of death 53603 non-null object \n",
|
|||
|
" 9 Age of death 1223008 non-null float64\n",
|
|||
|
"dtypes: float64(2), int64(1), object(7)\n",
|
|||
|
"memory usage: 93.3+ MB\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
" Id Name \\\n",
|
|||
|
"0 Q23 George Washington \n",
|
|||
|
"1 Q42 Douglas Adams \n",
|
|||
|
"2 Q91 Abraham Lincoln \n",
|
|||
|
"3 Q254 Wolfgang Amadeus Mozart \n",
|
|||
|
"4 Q255 Ludwig van Beethoven \n",
|
|||
|
"\n",
|
|||
|
" Short description Gender \\\n",
|
|||
|
"0 1st president of the United States (1732–1799) Male \n",
|
|||
|
"1 English writer and humorist Male \n",
|
|||
|
"2 16th president of the United States (1809-1865) Male \n",
|
|||
|
"3 Austrian composer of the Classical period Male \n",
|
|||
|
"4 German classical and romantic composer Male \n",
|
|||
|
"\n",
|
|||
|
" Country Occupation Birth year \\\n",
|
|||
|
"0 United States of America; Kingdom of Great Bri... Politician 1732 \n",
|
|||
|
"1 United Kingdom Artist 1952 \n",
|
|||
|
"2 United States of America Politician 1809 \n",
|
|||
|
"3 Archduchy of Austria; Archbishopric of Salzburg Artist 1756 \n",
|
|||
|
"4 Holy Roman Empire; Austrian Empire Artist 1770 \n",
|
|||
|
"\n",
|
|||
|
" Death year Manner of death Age of death \n",
|
|||
|
"0 1799.0 natural causes 67.0 \n",
|
|||
|
"1 2001.0 natural causes 49.0 \n",
|
|||
|
"2 1865.0 homicide 56.0 \n",
|
|||
|
"3 1791.0 NaN 35.0 \n",
|
|||
|
"4 1827.0 NaN 57.0 "
|
|||
|
],
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Id</th>\n",
|
|||
|
" <th>Name</th>\n",
|
|||
|
" <th>Short description</th>\n",
|
|||
|
" <th>Gender</th>\n",
|
|||
|
" <th>Country</th>\n",
|
|||
|
" <th>Occupation</th>\n",
|
|||
|
" <th>Birth year</th>\n",
|
|||
|
" <th>Death year</th>\n",
|
|||
|
" <th>Manner of death</th>\n",
|
|||
|
" <th>Age of death</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>Q23</td>\n",
|
|||
|
" <td>George Washington</td>\n",
|
|||
|
" <td>1st president of the United States (1732–1799)</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>United States of America; Kingdom of Great Bri...</td>\n",
|
|||
|
" <td>Politician</td>\n",
|
|||
|
" <td>1732</td>\n",
|
|||
|
" <td>1799.0</td>\n",
|
|||
|
" <td>natural causes</td>\n",
|
|||
|
" <td>67.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>Q42</td>\n",
|
|||
|
" <td>Douglas Adams</td>\n",
|
|||
|
" <td>English writer and humorist</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>United Kingdom</td>\n",
|
|||
|
" <td>Artist</td>\n",
|
|||
|
" <td>1952</td>\n",
|
|||
|
" <td>2001.0</td>\n",
|
|||
|
" <td>natural causes</td>\n",
|
|||
|
" <td>49.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>Q91</td>\n",
|
|||
|
" <td>Abraham Lincoln</td>\n",
|
|||
|
" <td>16th president of the United States (1809-1865)</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>Politician</td>\n",
|
|||
|
" <td>1809</td>\n",
|
|||
|
" <td>1865.0</td>\n",
|
|||
|
" <td>homicide</td>\n",
|
|||
|
" <td>56.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>Q254</td>\n",
|
|||
|
" <td>Wolfgang Amadeus Mozart</td>\n",
|
|||
|
" <td>Austrian composer of the Classical period</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>Archduchy of Austria; Archbishopric of Salzburg</td>\n",
|
|||
|
" <td>Artist</td>\n",
|
|||
|
" <td>1756</td>\n",
|
|||
|
" <td>1791.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>35.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>Q255</td>\n",
|
|||
|
" <td>Ludwig van Beethoven</td>\n",
|
|||
|
" <td>German classical and romantic composer</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>Holy Roman Empire; Austrian Empire</td>\n",
|
|||
|
" <td>Artist</td>\n",
|
|||
|
" <td>1770</td>\n",
|
|||
|
" <td>1827.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>57.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 3,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 3
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
|
"source": "Объект наблюдения — жизнь людей. Набор данных содержит такие сведения, как год рождения, год смерти, причина смерти, профессия и т. п.",
|
|||
|
"id": "481e7e75c47a6235"
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:48.557176Z",
|
|||
|
"start_time": "2024-12-07T05:04:34.437056Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"\n",
|
|||
|
"df1 = df[df['Birth year'] >= 0]\n",
|
|||
|
"\n",
|
|||
|
"plt.scatter(df1['Birth year'], df1['Age of death'], c=df1['Age of death'], alpha=0.6)\n",
|
|||
|
"\n",
|
|||
|
"plt.title(\"Диаграмма 1\")\n",
|
|||
|
"plt.ylabel(\"Возраст смерти\")\n",
|
|||
|
"plt.xlabel(\"Дата рождения\")\n",
|
|||
|
"plt.grid(visible=True)\n",
|
|||
|
"\n",
|
|||
|
"plt.show()"
|
|||
|
],
|
|||
|
"id": "13f9f5235d21fa36",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
],
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA08AAAIhCAYAAACWt4GEAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOz9d5ydV3Xo/3/2c9r0Jo16lyxZcpHkJvdOtTFgTEKHlJsC5v5ucC4/CN/c1/feEODmCzehmAQnfAFTQjfF2Bj3bkmWbPU2qqORNL2dmVOfvb9/rOe06bI1Gkle72Sw5pTn7LNPmb2etffaxjnnUEoppZRSSik1Jm+qG6CUUkoppZRSZwMNnpRSSimllFJqAjR4UkoppZRSSqkJ0OBJKaWUUkoppSZAgyellFJKKaWUmgANnpRSSimllFJqAjR4UkoppZRSSqkJ0OBJKaWUUkoppSZAgyellFJKKaWUmgANnpRSSr0un/nMZ1ixYsWIP5/5zGemunnnpHg8zs0338wvf/nLqW6KUkq9oYSnugFKKaXOfo2NjXzjG98ouezuu++eotac23p7e/n4xz9OS0vLVDdFKaXecDR4Ukop9br4vk9FRQVr1qwpuTwajU5Ng85hjz/+OP/4j//IwMDAVDdFKaXekHTanlJKqdclm81SVlY27u0+85nPcPPNN5dc9uMf/5gVK1bw9a9/PX/Z7t27ufvuu7nyyiu54IILuO666/j85z9PMpnM32akKYIAN998M//8z//MF77wBS6//HLWrVvHpz/9aXp6ekoe92c/+xl33nkna9as4eKLL+ad73wnDz/8cP76X/7yl/njtra2ltz3f//v/82KFSv48Ic/PKw9//qv/1py27179+avO3r06IQffyR9fX3cfffdXH755fzHf/zHmLdVSik1OTTzpJRS6nVJJBLU1tae9P16e3v5l3/5l5LL2tra+OAHP8iaNWv40pe+RDQa5ZlnnuE73/kOM2bM4C/+4i/yt73rrrt473vfO+y4P/rRj1i4cCFf/OIX6erq4itf+QqHDx/mxz/+McYYfvjDH/L5z3+eT37yk1x66aX09vby7//+7/zt3/4ta9euZdasWfljVVZW8vjjj/OBD3wAAOccDz/8MJ43/NxjZWUlTzzxBH/913+dv+yhhx7C8zystfnLTubxi5WVlfG73/2OJUuWlARiSimlTh8NnpRSSr0uPT09zJgx46Tv97WvfY05c+bQ3d2dv2zv3r2sXLmSr371q1RVVQFw9dVX8/zzz7N+/fqS4GnWrFnDpgoCeJ7Hd77zHaqrqwFoaGjgE5/4BM8++yzXX389zc3N/Nmf/Rkf//jH8/eZO3cud955J5s2beK2227LX3799deXBE+vvPIKvb29XHDBBcMe9/rrr+f3v/89bW1t+f54+OGHufzyy1m/fn3+difz+MWi0ShLliwZvUOVUkpNOg2elFJKvS5tbW1cdNFFJ3WfvXv38pOf/ITvf//7vO9978tffu2113LttdeSyWRoamri8OHD7N27l66uLurq6iZ07JtvvjkfOOV+D4fDbNy4keuvvz5fAbCvr48DBw5w+PDhfHCTTqdLjnXLLbfw2c9+lng8TlVVFQ899BA333wzbW1twx536dKlLFy4kCeeeIL3ve997Ny5k+PHj/PBD36wJHg6mcdXSil1ZtE1T0oppV6zdDrNiRMnTjoj8vnPf57bbruNtWvXllxureXLX/4yV1xxBbfddhv/8A//wK5du4jFYhM+9syZM0t+9zyP+vp6ent7AThy5Agf+9jHuPzyy/nQhz7Et7/9bbLZLCDT8oqtWbOG2tpannnmGay1/P73v+ftb3/7qI99yy238PjjjwMyZe/666/PZ9ByTubxlVJKnVk086SUUuo127VrF77vs2zZsgnf5+GHH2b79u185StfGXbdfffdx3e/+13+5//8n7z5zW/OZ5DuuuuuCR+/eBogSDXA7u
5uGhoasNbyF3/xF0QiEX7+85+zcuVKwuEwTU1N/PrXvx52LGMMN910E48//jjTpk0jlUpx3XXX8d3vfnfEx77lllv4/ve/Tzwe56GHHuJv//ZvSwpdnOzjK6WUOrNo5kkppdRr9vTTT1NdXc3q1asndPt0Os0//dM/8YlPfILGxsZh12/atIlly5bxnve8Jx84tba2snfv3pKiC2N55plnSqa/Pf7442SzWa666iq6u7s5ePAgd911FxdddBHhcDh/H2DEx7j11lt55pln+M1vfsOtt946Zgn2tWvXUl1dzb333kt3dzc33XRTyfWv5fGVUkqdOTTzpJRS6jXZvHkzP/3pT1m1ahU7d+4cdn06naarq4sjR46wYMECANrb21m8eDEf+chHRjzmxRdfzDe/+U3uu+8+1qxZw+HDh/nWt75FOp0mkUhMqF3Hjx/nr//6r/nIRz7C8ePH+T//5/9w3XXXsW7dOkCKM/zwhz9k1qxZ1NTU8Oyzz3L//fcDjPgYV111FdlslgceeIBvfetbYz6253ncdNNNfO973+Otb30r5eXlJddPmzbtpB9fKaXUmUODJ6WUUq/J+9//fkACoj/+4z8e8TZPP/00DQ0NfOlLX8pf9rnPfY5IJDLi7f/yL/+S7u5u7r//fu69915mz57NO9/5TowxfOtb36Kvr4+ampox23XbbbdRU1PDf/tv/42Kigre/e538zd/8zf567/5zW/yj//4j3zmM58hGo2ybNky/vVf/5UvfOELvPzyyyX7NwHEYjGuu+46NmzYwFVXXTVuv9x66638/Oc/H3Vt1Mk+vlJKqTOHcbo6VSml1GuwYsUKvvjFL3LnnXeOepsPf/jDzJ07tyR4mkw333wzV1xxxWl7PKWUUm8suuZJKaWUUkoppSZAp+0ppZR6TVavXk1DQ8OYt1m6dOmIhSGUUkqps5FO21NKKaWUUkqpCdBpe0oppZRSSik1ARo8KaWUUkoppdQEaPCklFJKKaWUUhOgwZNSSimllFJKTYAGT0oppZRSSik1AWdEqfJ0Os2dd97J3//937Nu3To+85nP8MADDwy73bp167j//vsBuOyyy+jv7y+5fvPmzVRWVp7UY3d29jOV9QaNgWnTqqe8Hecq7d/Jo307ubR/J4/27eTRvp1c2r+TR/t2cp0N/Ztr43imPHhKpVLcc8897Nu3L3/Z5z73Oe6555787y0tLXz4wx/mIx/5CACtra309/fz2GOPUVZWlr9dRUXFST++c5wRL+KZ0o5zlfbv5NG+nVzav5NH+3byaN9OLu3fyaN9O7nOhf6d0uCpqamJe+65h6FbTVVXV1NdXYj8PvOZz/DWt76VW2+9FYD9+/fT2NjI/PnzT2t7lVJKKaWUUm9cU7rmacOGDaxbt46f/OQno97mxRdfZOPGjXzqU5/KX9bU1MTixYtPRxOVUkoppZRSCpjizNMHPvCBcW9z33338e53v5vZs2fnL9u/fz+JRIIPf/jDHDx4kJUrV/J3f/d3rymgMuak73JK5R5/qttxrtL+nTzat5NL+3fyaN9OHu3byaX9O3m0byfX2dC/E23blK95GktzczMvvfQSn/vc50ouP3DgAL29vXzqU5+iqqqKf//3f+djH/sYv/vd76iqqjqpx5jIwrDT4Uxpx7lK+3fyaN9OLu3fyaN9O3m0byeX9u/k0b6dXOdC/57RwdMjjzzCypUrWbZsWcnl3/72t8lkMvnKel/+8pe54YYbePLJJ3nHO95xUo8x1VU/zobqI2cz7d/Jo307ubR/J4/27eTRvp1c2r+TR/t2cp0N/XvWVNsby7PPPsstt9wy7PJoNEo0Gs3/HovFmDdvHq2trSf9GGdK1Y8zpR3nKu3fyaN9O7m0fyeP9u3k0b6dXNq/k0f7dnKdC/17xm6S65xj27ZtXHLJJcMuv/XWW/nlL3+Zv2xwcJDDhw+zZMmS091MpZRSSiml1BvEGZt5amlpYWBgYNiUPWMMN9
54I1//+teZO3cuDQ0NfPWrX2XWrFnccMMNU9RapZRSSiml1LnujA2eOjs7AaitrR123X//7/+dcDjMPffcQzwe58o
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 4
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
|
"source": "Анализируем столбец Birth year при помощи \"ящика с усами\". Распределение смещено в сторону больших значений; это можно исправить при помощи oversampling и undersampling.",
|
|||
|
"id": "1d636f51fd70c157"
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:50.204024Z",
|
|||
|
"start_time": "2024-12-07T05:04:48.589403Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df['Birth year'])\n",
|
|||
|
"plt.title('Box Plot для Birth year')\n",
|
|||
|
"plt.xlabel('Birth year')\n",
|
|||
|
"plt.show()"
|
|||
|
],
|
|||
|
"id": "51358c8282ab6255",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
],
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAxoAAAIhCAYAAADJisyIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAy8klEQVR4nO3debxe86Hv8e/emUMGkQgnaCNkEJrJLC1ytBSn1Zgb4qBFa2irLYLbuoZze1CKIOaaihNSyj3q9Fy33GqLJhWn0iCIMWJHJjJn7+f+kZt97SQ72eEXmd7v1yuv9llrPWv91srvtbM/nvU8T1WlUqkEAACgoOq1PQAAAGDDIzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBsIHYWL9/dWM9b4B1ndAANmjHHXdcevXq1eDPLrvskuHDh+fZZ5/9TMYwZsyY5cbQp0+f7LrrrjnxxBMzduzY+m2vvfba9OrVa7X2/9577+Xkk0/OO++8U3roRb399tvLXYe+ffvmS1/6Un7yk59k+vTpy207ZsyYle5z7NixOfnkk1f7eQCsec3X9gAA1rQdd9wxP/3pT5MktbW1mTFjRu69996cdNJJGTNmTHbYYYfPZBwjR45Mly5dkiR1dXWZNm1arrvuuhx//PF54IEH0rt370+03z/+8Y958sknSw51jfrOd76TfffdN0myYMGCvP7667n22mszadKk/OpXv0qSbLHFFrn//vuz7bbbrnRfo0ePzquvvrqmhwzAJyA0gA3epptumv79+zdYttdee2XPPffMmDFjcs4553wm4+jTp0+23nrrBst23HHHfPnLX86vfvWrXHTRRZ/JONa2bbfdtsHfx+67754WLVrkvPPOyyuvvJIddtghLVu2XO7vDID1i1ungI1SmzZt0qpVq1RVVTVY/u///u8ZOnRoBgwYkL333js/+clPMmvWrCTJRx99lP322y8HHnhgFi5cmGTJ+wOGDx+evffeu8GtP0219dZbZ7PNNsu7777b6DYrG9OYMWMyYsSIJMk//uM/5txzz210P5MnT17u1qVevXplyJAhDbYbMmTIKrcbO3ZsjjnmmAwYMGCl+2qqDh06JEn938eyt0CNGTMmO+64Y0aPHp299947u+22W773ve/l17/+dd55553lbpeqqanJmWeemQEDBmS33XbLf/tv/y1z5sxZ4bEXL16cwYMH54c//OFy677yla/kggsuqH88evToHHzwwdlpp52y77775tprr01tbW2D54wePTpDhw5N//7984UvfCFf//rX89hjj9WvX9G5TJo06RNdN4B1mVc0gA1epVLJ4sWL6///zJkzc8cdd2ThwoU57LDD6re7/vrrc8011+Sb3/xmfvCDH+Stt97K1Vdfneeffz7/9m//lk033TSXXnppTjzxxIwaNSpnnnlm7rzzzjzzzDO56aab0qlTp9Ue24wZMzJjxoxGbxFa1Zj23XfffOc738kNN9yQkSNHrvT9HfPnz0+zZs3qb09auv8V/ZK7zz775Lvf/e4Kt/voo49y6qmnZvvtt8/Pf/7zbLbZZqmqqmp0X8uqq6ur//tYvHhxJk+enOuvvz577LFHtt9++0afV1tbm9tuuy2XXnppZsyYkUGDBmXevHmZMGFCRo4cmW233TZz585Nklx99dU57rjjcv3112fcuHG55pprsummm67w1avmzZvn0EMPzV133ZWPPvoom266aZIlMfXGG2/kZz/7WZLkxhtvzFVXXZVjjz02I0aMyN///vdce+21mTJlSv7lX/4lSXLPPffkkksuyRlnnJFBgwZl1qxZufnmm/OjH/0oAwYMyJZbbrnCc+nRo8cqrxvA+kZoABu85557Ln379l1u+VlnnVX/C96sWbNyww035Mgjj8xPfvKT+m169uyZYcOG5cEHH8ywYcOy11575aijjspNN92Ufv365corr8ywYcOyzz
77rHIcH/8Fe8GCBZk8eXKuuOKKVFdX56ijjlpu+6aOaWmkrOjWrI+bN29eWrVq1eCWpMbiqFOnTo1u9/rrr2f27Nk5+eSTs99++61yX8s6//zzc/755zdY1rFjx9x1112rfO6pp55a//6Opcf8+G1WS0PjgAMOqH+lZ88998zTTz+dP//5z43u97DDDsvNN9+cxx9/vD4+H3rooXz+85/PwIED8+GHH+b666/PUUcdVf8Kx+DBg9OxY8dccMEFOeGEE7LDDjvkrbfeykknndQg0rp165ahQ4dm7NixOfjggxs9F4ANjdAANnh9+/bNf//v/z3Jklc0Zs+enaeeeipXXXVV5s6dmx/84Ad5/vnns3DhwhxyyCENnrvLLrukW7duefbZZzNs2LAkydlnn50//OEPOfXUU9O9e/ecffbZTRrHl7/85eWWdevWLZdffvkKX4lYnTE1xZQpU9K+ffsmb9+Y7t27p3379hk9enS6d++eLbfcMs2bN2/yx8yefvrp9b9gL168OFOmTMmdd96Zo48+OnfdddcKo3CpPn36NOkYu+yyS4PHW2+9dYNP91pW9+7dM2jQoDz88MM57LDDMn/+/Dz22GP59re/nST561//mvnz52fIkCH1sZik/laxp59+OjvssEP9rWuzZ8/Oa6+9ljfeeCPPPPNMktTfbre65wKwvhIawAZvk002yc4779xg2eDBgzN37tzccsstGT58eP17Hjp37rzc8zt37pwPP/ywwf6+8pWv5Lbbbsuee+6Z1q1bN2kcN9xwQ/2nTrVo0SKbbbZZunbt2uj2qzOmpnjnnXfSrVu31XrOimy66aYZOXJkLrnkkhxwwAEN1jVl/926dWvw9zFgwIDss88+9e95GDVqVKPPbdu2bZPG2KZNmwaPq6urVxlChx9+eM4777xMmTIlY8eOzZw5c3LooYcmSWbOnJkkDT5K9+Pef//9JMmbb76Zn/zkJ/nTn/6UFi1aZLvttqv/NLFlj9/UcwFYXwkNYKO10047ZfTo0Xn77bfr34w8bdq0bLfddg22q6mpyTbbbFP/+OWXX85dd92VPn365N57783Xvva19OvXb5XH69mz50pvbVrW6oypKcaNG9fkj9Bd9k3yy9p9993zla98Ja+//np++MMfZuDAgbnhhhvy8ssvr9aYltpkk02y3Xbb5Y033vhEzy/hwAMPzCWXXJLf/va3+ctf/pK99967PgSXvhJ0xRVX5POf//xyz+3cuXPq6upy8sknp0WLFnnggQfSp0+fNG/ePJMmTcrDDz/8WZ4KwDrBp04BG60XXnghzZo1yzbbbJN+/fqlZcuWefTRRxts85e//CXvvvtuBg4cmGTJrT7nnntutt1229x3333p3bt3zjnnnCxYsKD4+Jo6purqVf8onzdvXp599tnsvffeq9y2rq5ulft88sknM3LkyAwfPjwnnHBC+vXrl44dO65y34358MMP8/rrr+dzn/vcaj2vKefeVG3bts1BBx2URx99NE8//XSGDh1av65fv35p0aJFpk6dmp133rn+T/PmzXPllVfm7bffzowZM/L666/n8MMPr1+XJE899VSSJdcVYGPiFQ1gg/fRRx/l+eefr3+8cOHCPPHEE3nwwQdz1FFH1b+J+eSTT851112XFi1aZL/99svbb7+dq6++Ottvv32+8Y1vJElGjRqVCRMm5Fe/+lVat26diy++OEcccUSuuuqqlX607CfRsWPHJo1p6X9t/93vfpcvfelLy32C0fTp0/PLX/4yVVVV6dixY4NrMX369CxcuDATJkzI5ptvnpdeeinTp09f6Xs5FixYkIsvvjjdunXLGWecsdrn9eabbzYYw7Rp03LLLbfko48+yre+9a3V2lf79u0zbdq0PPnkk0Xe83D44YfnqKOOSocOHbL//vvXL99ss83yrW99K1dffXU++uij7L777pk6dWquvvrqVFVVpXfv3mnXrl26deuWe+65J1tuuWXat2
+f//N//k/uvPPOJEtiD2BjIjSADd6ECRMafKpTq1atsu222+YHP/hBTjrppPrlZ5xxRjp37py77747999/fzp27Jg
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 5
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
|
"source": "Есть выбросы.",
|
|||
|
"id": "3421b8f5f5d2178a"
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:51.864318Z",
|
|||
|
"start_time": "2024-12-07T05:04:50.235251Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"# Статистический анализ для определения выбросов\n",
|
|||
|
"df = df[df['Birth year'] > 0]\n",
|
|||
|
"\n",
|
|||
|
"Q1 = df['Birth year'].quantile(0.25)\n",
|
|||
|
"Q3 = df['Birth year'].quantile(0.75)\n",
|
|||
|
"IQR = Q3 - Q1\n",
|
|||
|
"\n",
|
|||
|
"# Определение порога для выбросов\n",
|
|||
|
"threshold = 1.5 * IQR\n",
|
|||
|
"outliers = (df['Birth year'] < (Q1 - threshold)) | (df['Birth year'] > (Q3 + threshold))\n",
|
|||
|
"\n",
|
|||
|
"# Вывод выбросов\n",
|
|||
|
"print(\"Выбросы:\")\n",
|
|||
|
"print(df[outliers])\n",
|
|||
|
"\n",
|
|||
|
"# Обработка выбросов\n",
|
|||
|
"# В данном случае мы заменим выбросы на медианное значение\n",
|
|||
|
"median_review_no = df['Birth year'].median()\n",
|
|||
|
"df.loc[outliers, 'Birth year'] = median_review_no\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df['Birth year'])\n",
|
|||
|
"plt.title('Box Plot для Birth year')\n",
|
|||
|
"plt.xlabel('Birth year')\n",
|
|||
|
"plt.show()"
|
|||
|
],
|
|||
|
"id": "40e42ecfac902bc2",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Выбросы:\n",
|
|||
|
" Id Name \\\n",
|
|||
|
"8 Q297 Diego Velázquez \n",
|
|||
|
"9 Q301 El Greco \n",
|
|||
|
"11 Q307 Galileo Galilei \n",
|
|||
|
"13 Q346 Louis IX of France \n",
|
|||
|
"15 Q353 Blanche of Castile \n",
|
|||
|
"... ... ... \n",
|
|||
|
"1222824 Q76625167 Francisco de Segura \n",
|
|||
|
"1222864 Q76737718 Hans Osara \n",
|
|||
|
"1222928 Q76879920 Luigi Anguissola \n",
|
|||
|
"1222934 Q76887267 Luigi Strozzi \n",
|
|||
|
"1223002 Q77233630 Matteo Gregorio Rossi \n",
|
|||
|
"\n",
|
|||
|
" Short description Gender \\\n",
|
|||
|
"8 Spanish painter (1599-1660) Male \n",
|
|||
|
"9 Greek painter, sculptor and architect Male \n",
|
|||
|
"11 Italian mathematician, physicist, philosopher ... Male \n",
|
|||
|
"13 king of France Male \n",
|
|||
|
"15 Spanish princess and saint, queen consort of F... Female \n",
|
|||
|
"... ... ... \n",
|
|||
|
"1222824 Spanish poet of the ''siglo de oro''. Male \n",
|
|||
|
"1222864 Finnish lieutenant in the Cudgel War Male \n",
|
|||
|
"1222928 16th century Italian painter Male \n",
|
|||
|
"1222934 klerk uit Groothertogdom Toscane (1632-1700) Male \n",
|
|||
|
"1223002 Italian engraver and publisher NaN \n",
|
|||
|
"\n",
|
|||
|
" Country Occupation Birth year \\\n",
|
|||
|
"8 Spain Artist 1599 \n",
|
|||
|
"9 Spain; Republic of Venice Artist 1541 \n",
|
|||
|
"11 Grand Duchy of Tuscany; Duchy of Florence Astronomer 1564 \n",
|
|||
|
"13 France Ruler 1214 \n",
|
|||
|
"15 Kingdom of Castile Regent 1188 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"1222824 Spain Artist 1569 \n",
|
|||
|
"1222864 Finland; Sweden NaN 1560 \n",
|
|||
|
"1222928 NaN Artist 1479 \n",
|
|||
|
"1222934 Grand Duchy of Tuscany Politician 1632 \n",
|
|||
|
"1223002 NaN Publisher 1638 \n",
|
|||
|
"\n",
|
|||
|
" Death year Manner of death Age of death \n",
|
|||
|
"8 1660.0 NaN 61.0 \n",
|
|||
|
"9 1614.0 NaN 73.0 \n",
|
|||
|
"11 1642.0 NaN 78.0 \n",
|
|||
|
"13 1270.0 natural causes 56.0 \n",
|
|||
|
"15 1252.0 NaN 64.0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"1222824 1620.0 NaN 51.0 \n",
|
|||
|
"1222864 1601.0 NaN 41.0 \n",
|
|||
|
"1222928 1553.0 NaN 74.0 \n",
|
|||
|
"1222934 1700.0 NaN 68.0 \n",
|
|||
|
"1223002 1702.0 NaN 64.0 \n",
|
|||
|
"\n",
|
|||
|
"[103545 rows x 10 columns]\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
],
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAxoAAAIhCAYAAADJisyIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAufklEQVR4nO3debiVZaH//w+zKJMIoqEmZiCibhCHnMIp9atmiqkpgmUdq6NZWieH49fMIfsG6VFRTHFIy+yQpKfOMW0w5xGOmqF5UFBEUDhMggzCfn5/+HNdbQEZumGz4fW6Li5Zz/Oste+1730v1tv1rLWbVVVVBQAAoKDmjT0AAABg/SM0AACA4oQGAABQnNAAAACKExoAAEBxQgMAAChOaAAAAMUJDQAAoDihAbCe2FB//+qGer8B1nVCA1ivDR48OL169WrwZ7fddsuQIUPy1FNPrZUxjB49eqkx9O7dO7vvvntOPfXUjBkzpnbsNddck169eq3S7U+dOjWnnXZaJk+eXHroRb3xxhtLfR/69OmTT3/607nwwgszY8aMpY4dPXr0R97mmDFjctppp63y9QBY81o29gAA1rQdd9wx3/ve95IkS5YsycyZM/OLX/wiX/7ylzN69Oh88pOfXCvjGD58eLp27Zokqa+vz/Tp03PttdfmlFNOya9+9avssMMOq3W7jz32WB588MGSQ12jvv71r2f//fdPkixcuDATJkzINddck/Hjx+eOO+5Ikmy++eb55S9/mW222eYjb2vUqFF55ZVX1vSQAVgNQgNY77Vr1y59+/ZtsG3vvffOXnvtldGjR+ecc85ZK+Po3bt3ttpqqwbbdtxxx3zmM5/JHXfckYsvvnitjKOxbbPNNg3mY88990yrVq1y/vnn53/+53/yyU9+Mq1bt15qzgBoWpw6BWyQ2rZtmzZt2qRZs2YNtv/Xf/1XBg4cmH79+mWfffbJhRdemNmzZydJ5s6dmwMOOCCHHXZYFi1alOT99wcMGTIk++yzT4NTf1bWVlttlU033TRvvvnmco/5qDGNHj065513XpLkoIMOyrnnnrvc25k4ceJSpy716tUrBx54YIPjDjzwwBUeN2bMmJx44onp16/fR97WyurYsWOS1Objw6dAjR49OjvuuGNGjRqVffbZJ3vssUe++c1v5te//nUmT5681OlS06ZNy5lnnpl+/fpljz32yP/9v/838+bNW+bXXrx4cfbdd998+9vfXmrfIYcckgsuuKB2edSoUTniiCOy0047Zf/9988111yTJUuWNLjOqFGjMnDgwPTt2ze77LJLPve5z+Xee++t7V/WfRk/fvxqfd8A1mVe0QDWe1VVZfHixbW/z5o1Kz/96U+zaNGiHHvssbXjrrvuulx99dU56aSTctZZZ2XSpEm56qqr8uyzz+bf//3f065du1x22WU59dRTc/311+fMM8/MbbfdlieffDI33HBDOnfuvMpjmzlzZmbOnLncU4RWNKb9998/X//61zNixIgMHz78I9/fsWDBgrRo0aJ2etIHt7+sJ7kDBgzIP//zPy/zuLlz5+ZrX/tatt9++/z4xz/OpptummbNmi33tj6svr6+Nh+LFy/OxIkTc9111+VTn/pUtt9+++Veb8mSJbn55ptz2WWXZebMmenfv3/mz5+fcePGZfjw4dlmm23y7rvvJkmuuuqqDB48ONddd13Gjh2bq6++Ou3atVvmq1ctW7bM0Ucfndtvvz1z585Nu3btkrwfU6+99lp++MMfJkl+8pOf5Morr8zJJ5+c8847Ly+++GKuueaaTJkyJT/4wQ+SJD//+c9z6aWX5hvf+Eb69++f2bNn58Ybb8x3vvOd9OvXL1tsscUy78snPvGJFX7fAJoaoQGs955++un06dNnqe1nn3127Qne7NmzM2LEiBx//PG58MILa8f07NkzgwYNyl133ZVBgwZl7733zgknnJAbbrghdXV1ueKKKzJo0KAMGD
BgheP4+yfYCxcuzMSJEzNs2LA0b948J5xwwlLHr+yYPoiUZZ2a9ffmz5+fNm3aNDglaXlx1Llz5+UeN2HChMyZMyennXZaDjjggBXe1of967/+a/71X/+1wbZOnTrl9ttvX+F1v/a1r9Xe3/HB1/z706w+CI1DDz209krPXnvtlUcffTRPPPHEcm/32GOPzY033pj77ruvFp933313tt122+y666555513ct111+WEE06ovcKx7777plOnTrngggvypS99KZ/85CczadKkfPnLX24Qad27d8/AgQMzZsyYHHHEEcu9LwDrG6EBrPf69OmT73//+0nef0Vjzpw5eeihh3LllVfm3XffzVlnnZVnn302ixYtypFHHtngurvttlu6d++ep556KoMGDUqSfPe7380jjzySr33ta+nRo0e++93vrtQ4PvOZzyy1rXv37hk6dOgyX4lYlTGtjClTpqRDhw4rffzy9OjRIx06dMioUaPSo0ePbLHFFmnZsuVKf8zsGWecUXuCvXjx4kyZMiW33XZbvvCFL+T2229fZhR+oHfv3iv1NXbbbbcGl7faaqsGn+71YT169Ej//v1zzz335Nhjj82CBQty77335p/+6Z+SJP/93/+dBQsW5MADD6zFYpLaqWKPPvpoPvnJT9ZOXZszZ05effXVvPbaa3nyySeTpHa63areF4CmSmgA671NNtkkO++8c4Nt++67b959992MHDkyQ4YMqb3noUuXLktdv0uXLnnnnXca3N4hhxySm2++OXvttVc22mijlRrHiBEjap861apVq2y66abp1q3bco9flTGtjMmTJ6d79+6rdJ1ladeuXYYPH55LL700hx56aIN9K3P73bt3bzAf/fr1y4ABA2rvebj++uuXe92NN954pcbYtm3bBpebN2++whD6/Oc/n/PPPz9TpkzJmDFjMm/evBx99NFJklmzZiVJg4/S/Xtvv/12kuT111/PhRdemMcffzytWrXKdtttV/s0sQ9//ZW9LwBNldAANlg77bRTRo0alTfeeKP2ZuTp06dnu+22a3DctGnTsvXWW9cuv/zyy7n99tvTu3fv/OIXv8hRRx2Vurq6FX69nj17fuSpTR+2KmNaGWPHjl3pj9D98JvkP2zPPffMIYcckgkTJuTb3/52dt1114wYMSIvv/zyKo3pA5tsskm22267vPbaa6t1/RIOO+ywXHrppfnd736XZ555Jvvss08tBD94JWjYsGHZdtttl7puly5dUl9fn9NOOy2tWrXKr371q/Tu3TstW7bM+PHjc88996zNuwKwTvCpU8AG6/nnn0+LFi2y9dZbp66uLq1bt85vf/vbBsc888wzefPNN7Prrrsmef9Un3PPPTfbbLNN7rzzzuywww4555xzsnDhwuLjW9kxNW++4ofy+fPn56mnnso+++yzwmPr6+tXeJsPPvhghg8fniFDhuRLX/pS6urq0qlTpxXe9vK88847mTBhQj7+8Y+v0vVW5r6vrI033jiHH354fvvb3+bRRx/NwIEDa/vq6urSqlWrvPXWW9l5551rf1q2bJkrrrgib7zxRmbOnJkJEybk85//fG1fkjz00ENJ3v++AmxIvKIBrPfmzp2bZ599tnZ50aJF+dOf/pS77rorJ5xwQu1NzKeddlquvfbatGrVKgcccEDeeOONXHXVVdl+++1zzDHHJEmuv/76jBs3LnfccUc22mijXHLJJTnuuONy5ZVXfuRHy66OTp06rdSYPvi/7b///e/z6U9/eqlPMJoxY0ZuvfXWNGvWLJ06dWrwvZgxY0YWLVqUcePGZbPNNsvf/va3zJgx4yPfy7Fw4cJccskl6d69e77xjW+s8v16/fXXG4xh+vTpGTlyZObOnZuvfOUrq3RbHTp0yPTp0/Pggw8Wec/D5z//+Zxwwgnp2LFjDj744Nr2TTfdNF/5yldy1VVXZe7cudlzzz3z1ltv5aqrrkqzZs2yww47pH379unevXt+/v
OfZ4sttkiHDh3y8MMP57bbbkvyfuwBbEiEBrDeGzduXINPdWrTpk222WabnHXWWfnyl79c2/6Nb3wjXbp0yc9+9rP
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 6
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:52.618966Z",
|
|||
|
"start_time": "2024-12-07T05:04:51.884406Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"# Количество пустых значений признаков\n",
|
|||
|
"print(df.isnull().sum())\n",
|
|||
|
"\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"# Есть ли пустые значения признаков\n",
|
|||
|
"print(df.isnull().any())\n",
|
|||
|
"\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"# Процент пустых значений признаков\n",
|
|||
|
"for i in df.columns:\n",
|
|||
|
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
|||
|
" if null_rate > 0:\n",
|
|||
|
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
|||
|
],
|
|||
|
"id": "2b08e1385cc51e44",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Id 0\n",
|
|||
|
"Name 0\n",
|
|||
|
"Short description 67877\n",
|
|||
|
"Gender 133600\n",
|
|||
|
"Country 335192\n",
|
|||
|
"Occupation 206844\n",
|
|||
|
"Birth year 0\n",
|
|||
|
"Death year 1\n",
|
|||
|
"Manner of death 1168920\n",
|
|||
|
"Age of death 1\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Id False\n",
|
|||
|
"Name False\n",
|
|||
|
"Short description True\n",
|
|||
|
"Gender True\n",
|
|||
|
"Country True\n",
|
|||
|
"Occupation True\n",
|
|||
|
"Birth year False\n",
|
|||
|
"Death year True\n",
|
|||
|
"Manner of death True\n",
|
|||
|
"Age of death True\n",
|
|||
|
"dtype: bool\n",
|
|||
|
"\n",
|
|||
|
"Short description процент пустых значений: %5.55\n",
|
|||
|
"Gender процент пустых значений: %10.93\n",
|
|||
|
"Country процент пустых значений: %27.42\n",
|
|||
|
"Occupation процент пустых значений: %16.92\n",
|
|||
|
"Death year процент пустых значений: %0.00\n",
|
|||
|
"Manner of death процент пустых значений: %95.62\n",
|
|||
|
"Age of death процент пустых значений: %0.00\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 7
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:52.954779Z",
|
|||
|
"start_time": "2024-12-07T05:04:52.657355Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"\n",
|
|||
|
"# Замена пустых данных на медиану\n",
|
|||
|
"df[\"Birth year\"] = df[\"Birth year\"].fillna(df[\"Birth year\"].median())\n",
|
|||
|
"df[\"Death year\"] = df[\"Death year\"].fillna(df[\"Death year\"].max())\n",
|
|||
|
"df[\"Age of death\"] = df[\"Age of death\"].fillna(df[\"Death year\"] - df[\"Birth year\"] + 1)\n",
|
|||
|
"\n",
|
|||
|
"# Процент пустых значений признаков\n",
|
|||
|
"for i in df.columns:\n",
|
|||
|
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
|||
|
" if null_rate > 0:\n",
|
|||
|
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
|||
|
],
|
|||
|
"id": "f68b156fc306ada1",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Short description процент пустых значений: %5.55\n",
|
|||
|
"Gender процент пустых значений: %10.93\n",
|
|||
|
"Country процент пустых значений: %27.42\n",
|
|||
|
"Occupation процент пустых значений: %16.92\n",
|
|||
|
"Manner of death процент пустых значений: %95.62\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 8
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
|
"source": "Manner of death имеет слишком много пустых значений </br> Age of death считается из других полей <br> в выборке их лучше исключить",
|
|||
|
"id": "60f5a960c211b634"
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:54.146614Z",
|
|||
|
"start_time": "2024-12-07T05:04:52.968787Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"# Функция для создания выборок\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"dropna_df = df.dropna()\n",
|
|||
|
"df_input = dropna_df[[\"Id\", \"Gender\", \"Country\", \"Birth year\", \"Death year\"]].query('Gender == \"Male\" or Gender == \"Female\"')\n",
|
|||
|
"#[[\"Gender\"] == \"Male\" or [\"Gender\"] == \"Female\"]\n",
|
|||
|
"\n",
|
|||
|
"train_df, temp_df = train_test_split(df_input, test_size=0.4, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение остатка на контрольную и тестовую выборки\n",
|
|||
|
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
|
"# Сохранение выборок в файлы\n",
|
|||
|
"train_df.to_csv(\".//static//csv//train_data.csv\", index=False)\n",
|
|||
|
"val_df.to_csv(\".//static//csv//val_data.csv\", index=False)\n",
|
|||
|
"test_df.to_csv(\".//static//csv//test_data.csv\", index=False)"
|
|||
|
],
|
|||
|
"id": "ae1ee081769ff2aa",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 27438\n",
|
|||
|
"Размер контрольной выборки: 9146\n",
|
|||
|
"Размер тестовой выборки: 9146\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 9
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:54.249324Z",
|
|||
|
"start_time": "2024-12-07T05:04:54.174250Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"train_df = pd.read_csv(\".//static//csv//train_data.csv\")\n",
|
|||
|
"val_df = pd.read_csv(\".//static//csv//val_data.csv\")\n",
|
|||
|
"test_df = pd.read_csv(\".//static//csv//test_data.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Оценка сбалансированности\n",
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['Gender'].value_counts()\n",
|
|||
|
" print(f\"Распределение Gender в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print(f\"Процент Male: {counts['Male'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" print(f\"Процент Female: {counts['Female'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")"
|
|||
|
],
|
|||
|
"id": "264b15ff4137ef94",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Gender в обучающей выборке:\n",
|
|||
|
"Gender\n",
|
|||
|
"Male 22999\n",
|
|||
|
"Female 4439\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент Male: 83.82%\n",
|
|||
|
"Процент Female: 16.18%\n",
|
|||
|
"\n",
|
|||
|
"Распределение Gender в контрольной выборке:\n",
|
|||
|
"Gender\n",
|
|||
|
"Male 7721\n",
|
|||
|
"Female 1425\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент Male: 84.42%\n",
|
|||
|
"Процент Female: 15.58%\n",
|
|||
|
"\n",
|
|||
|
"Распределение Gender в тестовой выборке:\n",
|
|||
|
"Gender\n",
|
|||
|
"Male 7707\n",
|
|||
|
"Female 1439\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент Male: 84.27%\n",
|
|||
|
"Процент Female: 15.73%\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 10
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
|
"source": "Данные не сбалансированы: мужчин больше, чем женщин.",
|
|||
|
"id": "bb251943566dd9d7"
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:54.289751Z",
|
|||
|
"start_time": "2024-12-07T05:04:54.267835Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"from sklearn.utils import resample\n",
|
|||
|
"\n",
|
|||
|
"standard_data = df_input[df_input['Gender'] == 'Male']\n",
|
|||
|
"free_data = df_input[df_input['Gender'] == 'Female']\n",
|
|||
|
"\n",
|
|||
|
"# Уменьшение количества male до количества female\n",
|
|||
|
"standard_undersampled = resample(standard_data,\n",
|
|||
|
" replace=False, # выборка без замены\n",
|
|||
|
" n_samples=len(free_data), # количество образцов для выборки\n",
|
|||
|
" random_state=42) # для воспроизводимости\n",
|
|||
|
"\n",
|
|||
|
"# Объединение данных обратно\n",
|
|||
|
"df_input = pd.concat([standard_undersampled, free_data])\n",
|
|||
|
"\n",
|
|||
|
"train_df, temp_df = train_test_split(df_input, test_size=0.4, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение остатка на контрольную и тестовую выборки\n",
|
|||
|
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")"
|
|||
|
],
|
|||
|
"id": "f09dba8137306afc",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Gender в обучающей выборке:\n",
|
|||
|
"Gender\n",
|
|||
|
"Male 4419\n",
|
|||
|
"Female 4344\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент Male: 50.43%\n",
|
|||
|
"Процент Female: 49.57%\n",
|
|||
|
"\n",
|
|||
|
"Распределение Gender в контрольной выборке:\n",
|
|||
|
"Gender\n",
|
|||
|
"Female 1487\n",
|
|||
|
"Male 1434\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент Male: 49.09%\n",
|
|||
|
"Процент Female: 50.91%\n",
|
|||
|
"\n",
|
|||
|
"Распределение Gender в тестовой выборке:\n",
|
|||
|
"Gender\n",
|
|||
|
"Female 1472\n",
|
|||
|
"Male 1450\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент Male: 49.62%\n",
|
|||
|
"Процент Female: 50.38%\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 11
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
|
"source": "### 2) Образование студентов",
|
|||
|
"id": "34a46b89a2d7c8dc"
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:54.414989Z",
|
|||
|
"start_time": "2024-12-07T05:04:54.322842Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"df = pd.read_csv(\"StudentsPerformance.csv\")\n",
|
|||
|
"sampled_df = df.sample(frac=0.4)\n",
|
|||
|
"print(df.columns)"
|
|||
|
],
|
|||
|
"id": "b27758ba28415b32",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',\n",
|
|||
|
" 'test preparation course', 'math score', 'reading score',\n",
|
|||
|
" 'writing score'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 12
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:54.451091Z",
|
|||
|
"start_time": "2024-12-07T05:04:54.438626Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"df.info()\n",
|
|||
|
"print(df.shape)\n",
|
|||
|
"df.head()"
|
|||
|
],
|
|||
|
"id": "22da6934e834877c",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 1000 entries, 0 to 999\n",
|
|||
|
"Data columns (total 8 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 gender 1000 non-null object\n",
|
|||
|
" 1 race/ethnicity 1000 non-null object\n",
|
|||
|
" 2 parental level of education 1000 non-null object\n",
|
|||
|
" 3 lunch 1000 non-null object\n",
|
|||
|
" 4 test preparation course 1000 non-null object\n",
|
|||
|
" 5 math score 1000 non-null int64 \n",
|
|||
|
" 6 reading score 1000 non-null int64 \n",
|
|||
|
" 7 writing score 1000 non-null int64 \n",
|
|||
|
"dtypes: int64(3), object(5)\n",
|
|||
|
"memory usage: 62.6+ KB\n",
|
|||
|
"(1000, 8)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
" gender race/ethnicity parental level of education lunch \\\n",
|
|||
|
"0 female group B bachelor's degree standard \n",
|
|||
|
"1 female group C some college standard \n",
|
|||
|
"2 female group B master's degree standard \n",
|
|||
|
"3 male group A associate's degree free/reduced \n",
|
|||
|
"4 male group C some college standard \n",
|
|||
|
"\n",
|
|||
|
" test preparation course math score reading score writing score \n",
|
|||
|
"0 none 72 72 74 \n",
|
|||
|
"1 completed 69 90 88 \n",
|
|||
|
"2 none 90 95 93 \n",
|
|||
|
"3 none 47 57 44 \n",
|
|||
|
"4 none 76 78 75 "
|
|||
|
],
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>gender</th>\n",
|
|||
|
" <th>race/ethnicity</th>\n",
|
|||
|
" <th>parental level of education</th>\n",
|
|||
|
" <th>lunch</th>\n",
|
|||
|
" <th>test preparation course</th>\n",
|
|||
|
" <th>math score</th>\n",
|
|||
|
" <th>reading score</th>\n",
|
|||
|
" <th>writing score</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>female</td>\n",
|
|||
|
" <td>group B</td>\n",
|
|||
|
" <td>bachelor's degree</td>\n",
|
|||
|
" <td>standard</td>\n",
|
|||
|
" <td>none</td>\n",
|
|||
|
" <td>72</td>\n",
|
|||
|
" <td>72</td>\n",
|
|||
|
" <td>74</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>female</td>\n",
|
|||
|
" <td>group C</td>\n",
|
|||
|
" <td>some college</td>\n",
|
|||
|
" <td>standard</td>\n",
|
|||
|
" <td>completed</td>\n",
|
|||
|
" <td>69</td>\n",
|
|||
|
" <td>90</td>\n",
|
|||
|
" <td>88</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>female</td>\n",
|
|||
|
" <td>group B</td>\n",
|
|||
|
" <td>master's degree</td>\n",
|
|||
|
" <td>standard</td>\n",
|
|||
|
" <td>none</td>\n",
|
|||
|
" <td>90</td>\n",
|
|||
|
" <td>95</td>\n",
|
|||
|
" <td>93</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>group A</td>\n",
|
|||
|
" <td>associate's degree</td>\n",
|
|||
|
" <td>free/reduced</td>\n",
|
|||
|
" <td>none</td>\n",
|
|||
|
" <td>47</td>\n",
|
|||
|
" <td>57</td>\n",
|
|||
|
" <td>44</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>group C</td>\n",
|
|||
|
" <td>some college</td>\n",
|
|||
|
" <td>standard</td>\n",
|
|||
|
" <td>none</td>\n",
|
|||
|
" <td>76</td>\n",
|
|||
|
" <td>78</td>\n",
|
|||
|
" <td>75</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 13,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 13
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:54.594512Z",
|
|||
|
"start_time": "2024-12-07T05:04:54.586463Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"# Количество пустых значений признаков\n",
|
|||
|
"print(df.isnull().sum())\n",
|
|||
|
"\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"# Есть ли пустые значения признаков\n",
|
|||
|
"print(df.isnull().any())\n",
|
|||
|
"\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"# Процент пустых значений признаков\n",
|
|||
|
"for i in df.columns:\n",
|
|||
|
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
|||
|
" if null_rate > 0:\n",
|
|||
|
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
|||
|
],
|
|||
|
"id": "ca0b0adc2b6e172a",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"gender 0\n",
|
|||
|
"race/ethnicity 0\n",
|
|||
|
"parental level of education 0\n",
|
|||
|
"lunch 0\n",
|
|||
|
"test preparation course 0\n",
|
|||
|
"math score 0\n",
|
|||
|
"reading score 0\n",
|
|||
|
"writing score 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"gender False\n",
|
|||
|
"race/ethnicity False\n",
|
|||
|
"parental level of education False\n",
|
|||
|
"lunch False\n",
|
|||
|
"test preparation course False\n",
|
|||
|
"math score False\n",
|
|||
|
"reading score False\n",
|
|||
|
"writing score False\n",
|
|||
|
"dtype: bool\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 14
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:54.849186Z",
|
|||
|
"start_time": "2024-12-07T05:04:54.744842Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df['math score'])\n",
|
|||
|
"plt.title('Box Plot для math score')\n",
|
|||
|
"plt.xlabel('math score')\n",
|
|||
|
"plt.show()"
|
|||
|
],
|
|||
|
"id": "9f8f17302f5e065c",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
],
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAxoAAAIhCAYAAADJisyIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAtNUlEQVR4nO3dd5iV9Z3//9cAUhQVFDRZUBZj6G0UJVaURMUSNZYYJWDUjT2uLfavJdiiYguIYok1wQWR1ZhmLkuKBhVXvbIsYhRZ1KgYwQYyAuf3hz/OOkrPJw6Dj8d1cYW5z8057/ueT8bznLnPmZpKpVIJAABAQU0aegAAAGDNIzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAGplKpdLQIzQ45wBg9Sc0gDXC0KFD07Vr13p/+vfvn2HDhuWJJ574XGaYMGHCZ2bo3r17ttpqqxx++OGZPHlydd+f/OQn6dq160rd/+uvv54jjzwyr776aunRG5Vx48blxz/+cfXjxef9lVdeacCpAPi0Zg09AEApPXr0yHnnnZckWbhwYWbPnp2f//znOeKIIzJhwoR89atf/VzmGDlyZNq3b58kWbRoUd56662MGjUqhx56aMaPH59u3bqt0v0+9thjefTRR0uO2iiNHj06W2+9dUOPAcByCA1gjdG6dev069ev3rZtt90222yzTSZMmJDTTz/9c5mje/fu6dixY71tPXr0yC677JKf/exn+dGPfvS5zAEADcmlU8AarVWrVmnRokVqamrqbf/lL3+Z/fbbL7W1tdluu+1y7rnn5p133kmSvP/++9l5550zePDg1NXVJfn4NQHDhg3Ldtttl7fffnul5+jYsWPatm2b1157ban7LGumCRMm5Mwzz0ySfP3rX88ZZ5yx1Pt5+eWXP3MJV9euXTNo0KB6+w0aNGi5+02ePDkHH3xwamtrl3lfnzRp0qR07do1jz/+eIYOHZo+ffpkp512yrhx4/Lmm2/m+OOPT21tbQYOHJhbb7213r+dOnVqjj/++Hzta19Lz549s8MOO+TCCy/Mhx9+WJ351Vdfzb333vuZy6WeffbZfOc730nv3r2z00475aabblrqjEny4Ycf5vzzz8+OO+6YXr16ZfDgwbn55pvr7fPmm2/m9NNPzzbbbJPa2tp897vfzX/9139Vb58/f35GjRqVwYMHp3fv3tl1110zZsyYLFq0qLrP0KFDc+qpp+aEE05Iv379cthhh1X/7WWXXZaBAwemV69e+eY3v5lf/vKXy5wZoDERGsAao1KpZMGCBVmwYEE++uijzJo1KyNGjEhdXV3233//6n7XXXddTj755PTr1y/XXnttjjvuuPzmN7/J0KFD8+GHH6Z169a56KKL8vLLL+f6669Pktx+++2ZNGlSLr744mywwQYrPdvs2bMze/bsbLrppku8fXkz7bTTTjnmmGOSfHxp1rHHHrvUx/rwww/TtGnT3H333dU/AwcOXOK+AwcOXOp+77//fo4++ugkyYgRIzJ27Nhl3tennXzyyRk0aFBuuOGGdO7cOeedd16GDRuWr371q7nuuuvSp0+fXHLJJXnuueeSfPykfsiQIZk3b14uvfTS3Hjjjdlzzz1zxx135Pbbb68ee/v27atzb7TRRtXHO//887PnnntmzJgxqa2tzeWXX56HH354qfNdfPHF+f3vf5/TTz89N998c77+9a/nsssuyz333JMk+eCDD3LwwQdn0qRJ+eEPf5iRI0emRYsWOfzww/Pyyy+nUqnk6KOPzk033ZQDDzww119/fQYPHpyrr766egnfYr/61a+yzjrrZPTo0fm3f/u3VCqVHHfccRk7dmwOO+ywjB49OrW1tTnppJMyceLEFTq/AKs7l04Ba4wnn3wyPXv2/Mz2k08+OV/5yleSJO+8805Gjx6db3/72zn33HOr+3Tp0iVDhgzJPffckyFDhm
TbbbfNQQcdlDFjxqRv37658sorM2TIkBV6kr1o0aIsWLAgycfftX755ZdzxRVXpEmTJjnooIM+s/+KzrQ4UpZ0adYnzZs3Ly1atKh3GdnS4miDDTZY6n7Tp0/Pu+++myOPPDI777zzcu/r0/bff//qd+/XXnvtfPvb306fPn3y7//+70mSbt265be//W2efvrp9OnTJ9OmTUv37t1zzTXXpHXr1kk+vvTtT3/6UyZNmpQjjzwyPXr0SPPmzT8zd/Lx5/nggw9OkvTr1y8PPvhg/vznP9eb/ZOeeOKJbLfddtlzzz2TJAMGDMjaa6+dDTfcMEly7733Vn960r179yTJFltskX333TdPPvlkZsyYkcceeyxXXnll9T622267tGzZMtdcc001qpJkrbXWygUXXJDmzZsnSf70pz/lD3/4Q6666qrsscceSZIddtgh8+bNyxVXXJG99torzZr5TzTQuPkqBqwxevbsmQsuuCDJxz/dePfdd/P73/8+V111VebOnZuTTjopzzzzTOrq6rLXXnvV+7f9+/dPhw4d8sQTT2TIkCFJktNOOy1//OMfc/TRR6dz58457bTTVmiOXXbZ5TPbOnTokMsvv3yJ7zS1MjOtiL/97W9Zb731Vnj/pencuXPWW2+9jBs3Lp07d86XvvSlNGvWbIXfWra2trb698VP3vv27Vvd1rZt2yTJe++9lyTZfvvts/322+ejjz7KX//618yYMSPTpk3L22+/nTZt2iz38fr371/9e6tWrdKuXbu8++67S91/wIABGTt2bF5//fUMHDgwAwcOzHHHHVe9ffLkyenYsWM1Mhbf729+85skyeWXX55mzZpl8ODB9e537733zjXXXJMnnniiGhqbbbZZNTKS5PHHH09NTU0GDhxYjdLk40vD7rvvvrzwwgv1HhegMRIawBpjnXXWSe/evett23777TN37tzcdNNNGTZsWPU1D+3atfvMv2/Xrl31Se/i+9t1111zyy23ZJtttknLli1XaI7Ro0dX33VqrbXWStu2bbPxxhsvdf+VmWlFvPrqq+nQocNK/Zslad26dUaOHJkLL7wwu+22W73bVuT+F/9U4pNatWq11P0XLVqUK6+8MnfddVfmzp2bL3/5y+nTp09atGixQvN++r6bNGmyzCg6++yz86UvfSn33Xdfhg8fnuHDh6e2tjbnn39+unXrljlz5lQDaUneeeedtG3bNk2bNq23ffHn/tNr6ZPmzJmTSqWSLbbYYon3/eabbwoNoNETGsAar1evXhk3blxeeeWVrL/++kmSt956K5tttlm9/WbNmpVNNtmk+vG0adNyxx13pHv37vn5z3+evffeu9535JemS5cuy7y06dNWZqYV8fTTT6/wW+h++kXynzZgwIDsuuuumT59ek455ZRsscUWGT16dKZNm7ZSM62IMWPG5NZbb80FF1yQXXfdNeuuu26S5IADDij+WEnSvHnzHHPMMTnmmGPy2muv5eGHH851112XU045JQ888EDWXXfdJf5ujqeffjrrr79+1l9//cyePTsLFy6sFxtvvvlmkv/7ic2SrLvuull77bWrrz35tE6dOv2DRwfQ8LwYHFjjPffcc2natGk22WST9O3bN82bN88vfvGLevs89dRTee2116rfYV6wYEHOOOOMbLrpphk7dmy6deuW008/PfPnzy8+34rO1KTJ8r9kz5s3r/rag+VZtGjRcu/z0UcfzciRIzNs2LAcdthh6du37wpdxrQqJk+enM033zz7779/NTLeeOONTJs2rd67OK3IeVieDz/8MLvttltuueWWJMm//Mu/ZMiQIdlzzz2r7wzWv3//zJw5My+88EL1382fPz8/+MEPMn78+Gy99dZZsGBBfv3rX9e77/vuuy9JsuWWWy718bfeeuvMnTs3lUolvXv3rv6ZNm1aRo0aVe9yKoDGyk80gDXG+++/n2eeeab6cV1dXR566KHcc889Oeigg6ovYj7yyCMzat
SorLXWWtl5553zyiuv5Jprrsnmm2+eb33rW0mS66+/PlOmTMnPfvaztGzZMsOHD8+BBx6Yq666aplvLbsq2rRps0I
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 15
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
|
"source": "Есть выбросы",
|
|||
|
"id": "386fbfcb08f18b76"
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:55.000194Z",
|
|||
|
"start_time": "2024-12-07T05:04:54.880368Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"# Статистический анализ для определения выбросов\n",
|
|||
|
"Q1 = df['math score'].quantile(0.25)\n",
|
|||
|
"Q3 = df['math score'].quantile(0.75)\n",
|
|||
|
"IQR = Q3 - Q1\n",
|
|||
|
"\n",
|
|||
|
"# Определение порога для выбросов\n",
|
|||
|
"threshold = 1.5 * IQR\n",
|
|||
|
"outliers = (df['math score'] < (Q1 - threshold)) | (df['math score'] > (Q3 + threshold))\n",
|
|||
|
"\n",
|
|||
|
"# Вывод выбросов\n",
|
|||
|
"print(\"Выбросы:\")\n",
|
|||
|
"print(df[outliers])\n",
|
|||
|
"\n",
|
|||
|
"# Обработка выбросов\n",
|
|||
|
"# В данном случае мы заменим выбросы на медианное значение\n",
|
|||
|
"median_review_no = df['math score'].median()\n",
|
|||
|
"df.loc[outliers, 'math score'] = median_review_no\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df['math score'])\n",
|
|||
|
"plt.title('Box Plot для math score')\n",
|
|||
|
"plt.xlabel('math score')\n",
|
|||
|
"plt.show()"
|
|||
|
],
|
|||
|
"id": "8856143e19bd7485",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Выбросы:\n",
|
|||
|
" gender race/ethnicity parental level of education lunch \\\n",
|
|||
|
"17 female group B some high school free/reduced \n",
|
|||
|
"59 female group C some high school free/reduced \n",
|
|||
|
"145 female group C some college free/reduced \n",
|
|||
|
"338 female group B some high school free/reduced \n",
|
|||
|
"466 female group D associate's degree free/reduced \n",
|
|||
|
"787 female group B some college standard \n",
|
|||
|
"842 female group B high school free/reduced \n",
|
|||
|
"980 female group B high school free/reduced \n",
|
|||
|
"\n",
|
|||
|
" test preparation course math score reading score writing score \n",
|
|||
|
"17 none 18 32 28 \n",
|
|||
|
"59 none 0 17 10 \n",
|
|||
|
"145 none 22 39 33 \n",
|
|||
|
"338 none 24 38 27 \n",
|
|||
|
"466 none 26 31 38 \n",
|
|||
|
"787 none 19 38 32 \n",
|
|||
|
"842 completed 23 44 36 \n",
|
|||
|
"980 none 8 24 23 \n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
],
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAxoAAAIhCAYAAADJisyIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAr8ElEQVR4nO3debzXc8L//+chLYRCzFyhCaNoPUmNNRpLZDDWIWW9smSMscWMyzLZxr6UlGVclsEV6WLMZm6UdUIGlzGmEFeYUpTQhj6/P1w+XwcRv5dzKvf77XZunPfnfd6f13m9Oue8H+fz/nxOTaVSqQQAAKCg5Rp6AAAAwLJHaAAAAMUJDQAAoDihAQAAFCc0AACA4oQGAABQnNAAAACKExoAAEBxQgMAAChOaAAsZSqVSkMPocGZA4Aln9AAlgn9+/dPu3bt6rx17949AwYMyGOPPVYvYxg9evRnxrDRRhtl0003zSGHHJIJEyZU973iiivSrl27r3T8qVOnZuDAgXnttddKD32pMmrUqPz617+uvv/xvL/66qsNOCoAPq1RQw8AoJSNN944p59+epLkww8/zMyZM3PLLbfk0EMPzejRo/P973+/XsYxdOjQtGrVKkmycOHCzJgxI8OGDcuBBx6Y22+/Pe3bt/9ax33kkUcybty4kkNdKg0fPjw9evRo6GEA8CWEBrDMaN68ebp27Vpn2+abb57NNtsso0ePzuDBg+tlHBtttFHWXnvtOts23njjbL/99vntb3+bX/3qV/UyDgBoSC6dApZpzZo1S5MmTVJTU1Nn++9///vsscceqa2tzRZbbJHTTjstb7/9dpLk3Xffzbbbbps+ffpkwYIFST56TsCAAQOyxRZb5K233vrK41h77bXTsmXLvP7664vc54vGNHr06JxyyilJkh/+8Ic5+eSTF3mcl19++TOXcLVr1y69e/eus1/v3r2/dL8JEyZkv/32S21t7Rce65PGjx+fdu3a5dFHH03//v3TuXPnbLPNNhk1alTeeOONHH300amtrU2vXr1y/fXX1/nY559/PkcffXR+8IMfpEOHDtlqq61y1llnZd68edUxv/baa7nzzjs/c7nU008/nZ/85Cfp1KlTttlmm1xzzTWLHGOSzJs3L2eccUa23nrrdOzYMX369Mm1115bZ5833ngjgwcPzmabbZba2toccMAB+dvf/la9ff78+Rk2bFj69OmTTp06ZYcddsjIkSOzcOHC6j79+/fPCSeckGOOOSZdu3bNwQcfXP3Y888/P7169UrHjh3zox/9KL///e+/cMwASxOhASwzKpVKPvjgg3zwwQd5//33M3369Fx00UVZsGBB9txzz+p+V155ZY477rh07do1l19+eQYNGpQ//elP6d+/f+bNm5fmzZvn7LPPzssvv5yrrroqSXLDDTdk/PjxOeecc7Laaqt95bHNnDkzM2fOzLrrrvu5t3/ZmLbZZpsceeSRST66NOuoo45a5H3Nmzcvyy+/fG677bbqW69evT533169ei1yv3fffTdHHHFEkuSiiy7Krbfe+oXH+rTjjjsuvXv3zogRI9K2bducfvrpGTBgQL7//e/nyiuvTOfOnXPuuefmmWeeSfLRSX2/fv0yd+7cnHfeebn66qvTt2/f3Hjjjbnhhhuqn3urVq2q415zzTWr93fGGWekb9++GTlyZGpra3PBBRfk/vvvX+T4zjnnnDzwwAMZPHhwrr322vzwhz/M+eefnzvuuCNJ8t5772W//fbL+PHjc+KJJ2bo0KFp0qRJDjnkkLz88supVCo54ogjcs0112TvvffOVVddlT59+uTSSy+tXsL3sT/84Q9ZaaWVMnz48Bx22GGpVCoZNGhQbr311hx88MEZPnx4amtr8/Of/zxjxoxZrPkFWNK5dApYZjz++OPp0KHDZ7Yfd9xxWX/99ZMkb7/9doYPH5599tknp512WnWfDTfcMP369csdd9yRfv36Zf
PNN8++++6bkSNHpkuXLrn44ovTr1+/xTrJXrhwYT744IMkH/3W+uWXX86FF16Y5ZZbLvvuu+9n9l/cMX0cKZ93adYnzZ07N02aNKlzGdmi4mi11VZb5H6TJ0/O7NmzM3DgwGy77bZfeqxP23PPPau/vV9xxRWzzz77pHPnzvnZz36WJGnfvn3+/Oc/58knn0znzp0zceLEbLTRRrnsssvSvHnzJB9d+vbwww9n/PjxGThwYDbeeOM0btz4M+NOPlrn/fbbL0nStWvX3HvvvfnrX/9aZ+yf9Nhjj2WLLbZI3759kyQ9e/bMiiuumNVXXz1Jcuedd1YfPdloo42SJN26dcvuu++exx9/PK+88koeeeSRXHzxxdVjbLHFFmnatGkuu+yyalQlyQorrJAzzzwzjRs3TpI8/PDDefDBB3PJJZdk5513TpJstdVWmTt3bi688MLssssuadTIj2hg6ea7GLDM6NChQ84888wkHz26MXv27DzwwAO55JJLMmfOnPz85z/PU089lQULFmSXXXap87Hdu3dP69at89hjj6Vfv35JkpNOOikPPfRQjjjiiLRt2zYnnXTSYo1j++23/8y21q1b54ILLvjcV5r6KmNaHP/617+yyiqrLPb+i9K2bdusssoqGTVqVNq2bZvvfOc7adSo0WK/tGxtbW31/z8+ee/SpUt1W8uWLZMk77zzTpJkyy23zJZbbpn3338/L7zwQl555ZVMnDgxb731Vlq0aPGl99e9e/fq/zdr1ixrrLFGZs+evcj9e/bsmVtvvTVTp05Nr1690qtXrwwaNKh6+4QJE7L22mtXI+Pj4/7pT39KklxwwQVp1KhR+vTpU+e4u+66ay677LI89thj1dBYb731qpGRJI8++mhqamrSq1evapQmH10adtddd2XSpEl17hdgaSQ0gGXGSiutlE6dOtXZtuWWW2bOnDm55pprMmDAgOpzHtZYY43PfPwaa6xRPen9+Hg77LBDrrvuumy22WZp2rTpYo1j+PDh1VedWmGFFdKyZcustdZai9z/q4xpcbz22mtp3br1V/qYz9O8efMMHTo0Z511Vnbcccc6ty3O8T9+VOKTmjVrtsj9Fy5cmIsvvjg333xz5syZk+9+97vp3LlzmjRpsljj/fSxl1tuuS+Mol/+8pf5zne+k7vuuitDhgzJkCFDUltbmzPOOCPt27fPrFmzqoH0ed5+++20bNkyyy+/fJ3tH6/9p/8tfdKsWbNSqVTSrVu3zz32G2+8ITSApZ7QAJZ5HTt2zKhRo/Lqq69m1VVXTZLMmDEj6623Xp39pk+fnnXWWaf6/sSJE3PjjTdmo402yi233JJdd921zm/kF2XDDTf8wkubPu2rjGlxPPnkk4v9ErqffpL8p/Xs2TM77LBDJk+enOOPPz7dunXL8OHDM3HixK80psUxcuTIXH/99TnzzDOzww47ZOWVV06S7LXXXsXvK0kaN26cI488MkceeWRef/313H///bnyyitz/PHH55577snKK6/8uX+b48knn8yqq66aVVddNTNnzsyHH35YJzbeeOONJP/vEZvPs/LKK2fFFVesPvfk09q0afP/87MDaHieDA4s85555pksv/zyWWedddKlS5c0btw4v/vd7+rs88QTT+T111+v/ob5gw8+yMknn5x11103t956a9q3b5/Bgwdn/vz5xce3uGNabrkv/5Y9d+7c6nMPvszChQu/9Jjjxo3L0KFDM2DAgBx88MHp0qXLYl3G9HVMmDAhG2ywQfbcc89qZEybNi0TJ06s8ypOizMPX2bevHnZcccdc9111yVJ/u3f/i39+vVL3759q68M1r1790yZMiWTJk2qftz8+fPz05/+NLfffnt69OiRDz74IH/84x/rHPuuu+5KkmyyySaLvP8ePXpkzpw5qVQq6dSpU/Vt4sSJGTZsWJ3LqQCWVh7RAJYZ7777bp566qnq+wsWLMh9992XO+64I/vuu2/1ScwDBw7MsG
HDssIKK2TbbbfNq6++mssuuywbbLBBfvzjHydJrrrqqjz33HP57W9/m6ZNm2bIkCHZe++9c8kll3zhS8t+HS1atFi
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 16
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:55.105781Z",
|
|||
|
"start_time": "2024-12-07T05:04:55.087478Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"# Функция для создания выборок\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"dropna_df = df.dropna()\n",
|
|||
|
"df_input = dropna_df[[\"math score\", \"reading score\", \"writing score\", \"lunch\"]]\n",
|
|||
|
"\n",
|
|||
|
"train_df, temp_df = train_test_split(df_input, test_size=0.4, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение остатка на контрольную и тестовую выборки\n",
|
|||
|
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
|
"# Сохранение выборок в файлы\n",
|
|||
|
"train_df.to_csv(\".//static//csv//train_data.csv\", index=False)\n",
|
|||
|
"val_df.to_csv(\".//static//csv//val_data.csv\", index=False)\n",
|
|||
|
"test_df.to_csv(\".//static//csv//test_data.csv\", index=False)"
|
|||
|
],
|
|||
|
"id": "19b2227d09187635",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 600\n",
|
|||
|
"Размер контрольной выборки: 200\n",
|
|||
|
"Размер тестовой выборки: 200\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 17
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:55.176635Z",
|
|||
|
"start_time": "2024-12-07T05:04:55.144596Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"train_df = pd.read_csv(\".//static//csv//train_data.csv\")\n",
|
|||
|
"val_df = pd.read_csv(\".//static//csv//val_data.csv\")\n",
|
|||
|
"test_df = pd.read_csv(\".//static//csv//test_data.csv\")\n",
|
|||
|
"\n",
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['lunch'].value_counts()\n",
|
|||
|
" print(f\"Распределение lunch в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print(f\"Процент standard: {counts['standard'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" print(f\"Процент free/reduced: {counts['free/reduced'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")"
|
|||
|
],
|
|||
|
"id": "3e5fe04e72cba1f1",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение lunch в обучающей выборке:\n",
|
|||
|
"lunch\n",
|
|||
|
"standard 394\n",
|
|||
|
"free/reduced 206\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент standard: 65.67%\n",
|
|||
|
"Процент free/reduced: 34.33%\n",
|
|||
|
"\n",
|
|||
|
"Распределение lunch в контрольной выборке:\n",
|
|||
|
"lunch\n",
|
|||
|
"standard 134\n",
|
|||
|
"free/reduced 66\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент standard: 67.00%\n",
|
|||
|
"Процент free/reduced: 33.00%\n",
|
|||
|
"\n",
|
|||
|
"Распределение lunch в тестовой выборке:\n",
|
|||
|
"lunch\n",
|
|||
|
"standard 117\n",
|
|||
|
"free/reduced 83\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент standard: 58.50%\n",
|
|||
|
"Процент free/reduced: 41.50%\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 18
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:55.208465Z",
|
|||
|
"start_time": "2024-12-07T05:04:55.195804Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"standard_data = df_input[df_input['lunch'] == 'standard']\n",
|
|||
|
"free_data = df_input[df_input['lunch'] == 'free/reduced']\n",
|
|||
|
"\n",
|
|||
|
"# Уменьшение количества male до количества female\n",
|
|||
|
"standard_undersampled = resample(standard_data,\n",
|
|||
|
" replace=False, # выборка без замены\n",
|
|||
|
" n_samples=len(free_data), # количество образцов для выборки\n",
|
|||
|
" random_state=42) # для воспроизводимости\n",
|
|||
|
"\n",
|
|||
|
"# Объединение данных обратно\n",
|
|||
|
"df_input = pd.concat([standard_undersampled, free_data])\n",
|
|||
|
"\n",
|
|||
|
"train_df, temp_df = train_test_split(df_input, test_size=0.4, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение остатка на контрольную и тестовую выборки\n",
|
|||
|
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")\n",
|
|||
|
"\n",
|
|||
|
"train_df.to_csv(\".//static//csv//train_data.csv\", index=False)\n",
|
|||
|
"val_df.to_csv(\".//static//csv//val_data.csv\", index=False)\n",
|
|||
|
"test_df.to_csv(\".//static//csv//test_data.csv\", index=False)"
|
|||
|
],
|
|||
|
"id": "89d35c1d76854af5",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение lunch в обучающей выборке:\n",
|
|||
|
"lunch\n",
|
|||
|
"free/reduced 230\n",
|
|||
|
"standard 196\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент standard: 46.01%\n",
|
|||
|
"Процент free/reduced: 53.99%\n",
|
|||
|
"\n",
|
|||
|
"Распределение lunch в контрольной выборке:\n",
|
|||
|
"lunch\n",
|
|||
|
"standard 75\n",
|
|||
|
"free/reduced 67\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент standard: 52.82%\n",
|
|||
|
"Процент free/reduced: 47.18%\n",
|
|||
|
"\n",
|
|||
|
"Распределение lunch в тестовой выборке:\n",
|
|||
|
"lunch\n",
|
|||
|
"standard 84\n",
|
|||
|
"free/reduced 58\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент standard: 59.15%\n",
|
|||
|
"Процент free/reduced: 40.85%\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 19
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
|
"source": "3) Зарплаты ",
|
|||
|
"id": "90f5f62d0f207fcd"
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:04:55.249310Z",
|
|||
|
"start_time": "2024-12-07T05:04:55.226833Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"df = pd.read_csv(\"ds_salaries.csv\")\n",
|
|||
|
"sampled_df = df.sample(frac=0.4)\n",
|
|||
|
"print(df.columns)\n",
|
|||
|
"print(df)"
|
|||
|
],
|
|||
|
"id": "aa48737cf6a10934",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['work_year', 'experience_level', 'employment_type', 'job_title',\n",
|
|||
|
" 'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',\n",
|
|||
|
" 'remote_ratio', 'company_location', 'company_size'],\n",
|
|||
|
" dtype='object')\n",
|
|||
|
" work_year experience_level employment_type job_title \\\n",
|
|||
|
"0 2023 SE FT Principal Data Scientist \n",
|
|||
|
"1 2023 MI CT ML Engineer \n",
|
|||
|
"2 2023 MI CT ML Engineer \n",
|
|||
|
"3 2023 SE FT Data Scientist \n",
|
|||
|
"4 2023 SE FT Data Scientist \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"3750 2020 SE FT Data Scientist \n",
|
|||
|
"3751 2021 MI FT Principal Data Scientist \n",
|
|||
|
"3752 2020 EN FT Data Scientist \n",
|
|||
|
"3753 2020 EN CT Business Data Analyst \n",
|
|||
|
"3754 2021 SE FT Data Science Manager \n",
|
|||
|
"\n",
|
|||
|
" salary salary_currency salary_in_usd employee_residence remote_ratio \\\n",
|
|||
|
"0 80000 EUR 85847 ES 100 \n",
|
|||
|
"1 30000 USD 30000 US 100 \n",
|
|||
|
"2 25500 USD 25500 US 100 \n",
|
|||
|
"3 175000 USD 175000 CA 100 \n",
|
|||
|
"4 120000 USD 120000 CA 100 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"3750 412000 USD 412000 US 100 \n",
|
|||
|
"3751 151000 USD 151000 US 100 \n",
|
|||
|
"3752 105000 USD 105000 US 100 \n",
|
|||
|
"3753 100000 USD 100000 US 100 \n",
|
|||
|
"3754 7000000 INR 94665 IN 50 \n",
|
|||
|
"\n",
|
|||
|
" company_location company_size \n",
|
|||
|
"0 ES L \n",
|
|||
|
"1 US S \n",
|
|||
|
"2 US S \n",
|
|||
|
"3 CA M \n",
|
|||
|
"4 CA M \n",
|
|||
|
"... ... ... \n",
|
|||
|
"3750 US L \n",
|
|||
|
"3751 US L \n",
|
|||
|
"3752 US S \n",
|
|||
|
"3753 US L \n",
|
|||
|
"3754 IN L \n",
|
|||
|
"\n",
|
|||
|
"[3755 rows x 11 columns]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 20
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:08:29.826827Z",
|
|||
|
"start_time": "2024-12-07T05:08:29.809667Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"df.info()\n",
|
|||
|
"print(df.shape)\n",
|
|||
|
"df.head()"
|
|||
|
],
|
|||
|
"id": "efe4a65727cb2a42",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 3755 entries, 0 to 3754\n",
|
|||
|
"Data columns (total 11 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 work_year 3755 non-null int64 \n",
|
|||
|
" 1 experience_level 3755 non-null object\n",
|
|||
|
" 2 employment_type 3755 non-null object\n",
|
|||
|
" 3 job_title 3755 non-null object\n",
|
|||
|
" 4 salary 3755 non-null int64 \n",
|
|||
|
" 5 salary_currency 3755 non-null object\n",
|
|||
|
" 6 salary_in_usd 3755 non-null int64 \n",
|
|||
|
" 7 employee_residence 3755 non-null object\n",
|
|||
|
" 8 remote_ratio 3755 non-null int64 \n",
|
|||
|
" 9 company_location 3755 non-null object\n",
|
|||
|
" 10 company_size 3755 non-null object\n",
|
|||
|
"dtypes: int64(4), object(7)\n",
|
|||
|
"memory usage: 322.8+ KB\n",
|
|||
|
"(3755, 11)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
" work_year experience_level employment_type job_title \\\n",
|
|||
|
"0 2023 SE FT Principal Data Scientist \n",
|
|||
|
"1 2023 MI CT ML Engineer \n",
|
|||
|
"2 2023 MI CT ML Engineer \n",
|
|||
|
"3 2023 SE FT Data Scientist \n",
|
|||
|
"4 2023 SE FT Data Scientist \n",
|
|||
|
"\n",
|
|||
|
" salary salary_currency salary_in_usd employee_residence remote_ratio \\\n",
|
|||
|
"0 80000 EUR 85847 ES 100 \n",
|
|||
|
"1 30000 USD 30000 US 100 \n",
|
|||
|
"2 25500 USD 25500 US 100 \n",
|
|||
|
"3 175000 USD 175000 CA 100 \n",
|
|||
|
"4 120000 USD 120000 CA 100 \n",
|
|||
|
"\n",
|
|||
|
" company_location company_size \n",
|
|||
|
"0 ES L \n",
|
|||
|
"1 US S \n",
|
|||
|
"2 US S \n",
|
|||
|
"3 CA M \n",
|
|||
|
"4 CA M "
|
|||
|
],
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>work_year</th>\n",
|
|||
|
" <th>experience_level</th>\n",
|
|||
|
" <th>employment_type</th>\n",
|
|||
|
" <th>job_title</th>\n",
|
|||
|
" <th>salary</th>\n",
|
|||
|
" <th>salary_currency</th>\n",
|
|||
|
" <th>salary_in_usd</th>\n",
|
|||
|
" <th>employee_residence</th>\n",
|
|||
|
" <th>remote_ratio</th>\n",
|
|||
|
" <th>company_location</th>\n",
|
|||
|
" <th>company_size</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>2023</td>\n",
|
|||
|
" <td>SE</td>\n",
|
|||
|
" <td>FT</td>\n",
|
|||
|
" <td>Principal Data Scientist</td>\n",
|
|||
|
" <td>80000</td>\n",
|
|||
|
" <td>EUR</td>\n",
|
|||
|
" <td>85847</td>\n",
|
|||
|
" <td>ES</td>\n",
|
|||
|
" <td>100</td>\n",
|
|||
|
" <td>ES</td>\n",
|
|||
|
" <td>L</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>2023</td>\n",
|
|||
|
" <td>MI</td>\n",
|
|||
|
" <td>CT</td>\n",
|
|||
|
" <td>ML Engineer</td>\n",
|
|||
|
" <td>30000</td>\n",
|
|||
|
" <td>USD</td>\n",
|
|||
|
" <td>30000</td>\n",
|
|||
|
" <td>US</td>\n",
|
|||
|
" <td>100</td>\n",
|
|||
|
" <td>US</td>\n",
|
|||
|
" <td>S</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>2023</td>\n",
|
|||
|
" <td>MI</td>\n",
|
|||
|
" <td>CT</td>\n",
|
|||
|
" <td>ML Engineer</td>\n",
|
|||
|
" <td>25500</td>\n",
|
|||
|
" <td>USD</td>\n",
|
|||
|
" <td>25500</td>\n",
|
|||
|
" <td>US</td>\n",
|
|||
|
" <td>100</td>\n",
|
|||
|
" <td>US</td>\n",
|
|||
|
" <td>S</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>2023</td>\n",
|
|||
|
" <td>SE</td>\n",
|
|||
|
" <td>FT</td>\n",
|
|||
|
" <td>Data Scientist</td>\n",
|
|||
|
" <td>175000</td>\n",
|
|||
|
" <td>USD</td>\n",
|
|||
|
" <td>175000</td>\n",
|
|||
|
" <td>CA</td>\n",
|
|||
|
" <td>100</td>\n",
|
|||
|
" <td>CA</td>\n",
|
|||
|
" <td>M</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>2023</td>\n",
|
|||
|
" <td>SE</td>\n",
|
|||
|
" <td>FT</td>\n",
|
|||
|
" <td>Data Scientist</td>\n",
|
|||
|
" <td>120000</td>\n",
|
|||
|
" <td>USD</td>\n",
|
|||
|
" <td>120000</td>\n",
|
|||
|
" <td>CA</td>\n",
|
|||
|
" <td>100</td>\n",
|
|||
|
" <td>CA</td>\n",
|
|||
|
" <td>M</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 21,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 21
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:09:03.492543Z",
|
|||
|
"start_time": "2024-12-07T05:09:03.484743Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"# Количество пустых значений признаков\n",
|
|||
|
"print(df.isnull().sum())\n",
|
|||
|
"\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"# Есть ли пустые значения признаков\n",
|
|||
|
"print(df.isnull().any())\n",
|
|||
|
"\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"# Процент пустых значений признаков\n",
|
|||
|
"for i in df.columns:\n",
|
|||
|
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
|||
|
" if null_rate > 0:\n",
|
|||
|
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
|||
|
],
|
|||
|
"id": "7fe305e3322450b0",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"work_year 0\n",
|
|||
|
"experience_level 0\n",
|
|||
|
"employment_type 0\n",
|
|||
|
"job_title 0\n",
|
|||
|
"salary 0\n",
|
|||
|
"salary_currency 0\n",
|
|||
|
"salary_in_usd 0\n",
|
|||
|
"employee_residence 0\n",
|
|||
|
"remote_ratio 0\n",
|
|||
|
"company_location 0\n",
|
|||
|
"company_size 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"work_year False\n",
|
|||
|
"experience_level False\n",
|
|||
|
"employment_type False\n",
|
|||
|
"job_title False\n",
|
|||
|
"salary False\n",
|
|||
|
"salary_currency False\n",
|
|||
|
"salary_in_usd False\n",
|
|||
|
"employee_residence False\n",
|
|||
|
"remote_ratio False\n",
|
|||
|
"company_location False\n",
|
|||
|
"company_size False\n",
|
|||
|
"dtype: bool\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 22
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:10:52.411701Z",
|
|||
|
"start_time": "2024-12-07T05:10:52.310163Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df['salary_in_usd'])\n",
|
|||
|
"plt.title('Box Plot для salary_in_usd')\n",
|
|||
|
"plt.xlabel('math salary_in_usd')\n",
|
|||
|
"plt.show()"
|
|||
|
],
|
|||
|
"id": "ae7082a6ae02ce66",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
],
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAxoAAAIhCAYAAADJisyIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA8RElEQVR4nO3deVxVdcLH8S+ICC4IJmKpzZgKuCCrioqZ5JJbmlZOWjppqWmWjWvL5BSZmpaWlsu4W1puWdqj5TyN9ZQ7KC1oai6puWAoKKIInOcPhzNcFgH96UX9vF+vXuO959xzfvfc38D9cO+518WyLEsAAAAAYJCrswcAAAAA4NZDaAAAAAAwjtAAAAAAYByhAQAAAMA4QgMAAACAcYQGAAAAAOMIDQAAAADGERoAAAAAjCM0AKCE4PtTAQC3EkIDQIn2xBNPKCAgwOG/iIgI9e7dW1u3br0hY1i5cmWeMdStW1eNGjVS3759FRsba687depUBQQEFGv7x48fV//+/XX06FHTQy+RoqOjNXr0aGcPwzZ69GhFR0c7exjFkj0njxw54uyhAECB3Jw9AAAoTL169TRmzBhJUmZmpk6fPq0lS5aoX79+WrlyperUqXNDxjFt2jT5+vpKkrKysnTq1Cm9//776tOnj5YvX67AwMCr2u7GjRv1zTffmBwqimHQoEHq3bu3s4cBALccQgNAiVe+fHmFhIQ4XNesWTM1bdpUK1eu1KhRo27IOOrWravq1as7XFevXj21adNGixcv1uuvv35DxgGz7r77bmcPAQBuSbx1CsBNydPTU2XKlJGLi4vD9f/zP/+jbt26KTQ0VM2bN9err76q5ORkSdK5c+fUqlUrPfDAA0pPT5d0+byI3r17q3nz5kpKSir2OKpXry4fHx/9/vvvBa5zpTGtXLlSL774oiTp/vvvv+Jbig4ePJjnLVwBAQF53vYTHR1d6HqxsbF67LHHFBoaesVt5bZgwQI98MADCgoKUosWLfSPf/xD586ds5cnJSXptddeU6tWrdSgQQM1btxYgwcPvuJbfI4cOaKRI0cqKipK9evXV9OmTTVy5EidPn3a4T69+eab6tOnjxo2bKiXX35ZUVFRGjZsWJ7ttW3bVq+88soV70dOud86FR0drffee08TJkxQs2bN1LBhQ/Xr108HDx4s8jalgt9GFxAQoKlTp9qX16xZowcffFANGzZUZGSkhg8frhMnTtjLs7Ky9MEHH+i+++5TcHCwBg0aZM8fACjJeEUDQIlnWZYyMjLsf585c0YLFixQenq6unfvbq/3wQcf6L333lPPnj31wgsv6PDhw3r33Xe1c+dOLV26VOXLl9fYsWPVt29fzZgxQ88995wWLlyoLVu2aNasWapUqVKxx3b69GmdPn26wL+KFzam++67T88884ymT5+uadOmXfH8jgsXLqhUqVJavHixw/b37duXZ92WLVtq0KBB+a537tw5DRw4ULVr19bbb78tHx8fubi4FLitbGvWrNHEiRM1atQoBQQEaP/+/ZowYYLS0tI0YcIEWZalAQMGKDk5WcOHD1flypX1yy+/aMqUKRozZozmzJmTZ5tpaWnq3bu3fHx8NGbMGFWoUEE7duzQtGnT5OHh4fAq0UcffaQnn3xSTz/9tMqVKycfHx8tWrRI586dU/ny5SVdDqhDhw5p/PjxBd6Poli4cKHCw8M1btw4JScna+zYsRo1apQ++eSTa9pubrGxsRo5cqQGDRqkRo0a6fjx45o4caKGDRumDz/8UJI0ceJELVy4UM8884yCg4O1du1avf3220bHAQDXA6EBoMTbtm2b6tevn+f6v/3tb6pVq5YkKTk5WdOnT9ejjz6qV1991V7H399fvXr10ooVK9SrVy81a9ZMPXr00KxZsxQcHKx33nlHvXr1UsuWLQsdR1ZWlh08Fy9e1MGDBzVp0iS5urqqR48eedYv6piyIyW/t2bllJaWpjJlyji8ja
ygOKpUqVKB6x04cEApKSnq37+/WrVqVei2sm3dulXVq1dXr1695OrqqsaNG6ts2bL2X9dPnjwpT09PjRo1ShEREZKkJk2a6LfffivwCfrBgwdVtWpVTZgwQTVq1JAkRUZGKj4+Ps/J/nfddZeGDx9uX/b29tY///lPffnll3Zwrlq1Sn/+858VFhZ2xftSGC8vL33wwQcqVaqUJOm3337T1KlTdfr0afn4+FzTtnOKjY2Vh4eH+vfvL3d3d0mX79ePP/4oy7J09uxZLVq0SE8++aSeffZZSVKLFi108uRJ/d///Z+xcQDA9UBoACjx6tevr9dee03S5Vc0UlJS9O2332ry5Mk6f/68XnjhBe3cuVPp6enq1KmTw20jIiJUrVo1bd26Vb169ZIkjRw5Ut99950GDhyomjVrauTIkUUaR5s2bfJcV61aNU2cODHfVyKKM6aiOHbsmLy8vIq8fkFq1qwpLy8vLVu2TDVr1lTVqlXl5uZW6MfrRkZG6pNPPlG3bt3UunVrtWzZUp07d7bfvubn56eFCxfKsiwdOXJEhw4d0v79+xUXF2e/VS23unXravHixcrKytLBgwd16NAh7du3T/v377ejLue6ue9HeHi4PvvsM3Xv3l0XLlzQ2rVr9fTTT1/D0bksKCjIjgxJqlq1qqTLsWcyNBo1aqTJkyerU6dOateunVq2bKmoqCg7fHfu3KlLly45BKEktW/fntAAUOIRGgBKvHLlyikoKMjhuqioKJ0/f16zZ89W79697b+qV65cOc/tK1eurLNnzzpsr23btpo7d66aNm0qDw+PIo1j+vTp9qdOlS5dWj4+PvLz8ytw/eKMqSiOHj2qatWqFes2+SlfvrymTZumN954Q+3atXNYdqXtd+jQQVlZWVq8eLE++OADTZ06VdWqVdPw4cPVoUMHSdLnn3+ud955R8eOHZO3t7fq1q1b6PGdN2+eZsyYoTNnzqhy5cpq0KCBPD098xyfsmXL5rntww8/rJdeeknHjh1TbGysUlNT1bVr1yIeiYJ5eno6XHZ1vXxKY1ZW1jVvO6fQ0FDNmjVL8+fP17x58zRr1ixVrlxZAwcO1BNPPGHPodxxkz0PAaAk42RwADetBg0aKCMjQ0eOHFHFihUlSadOncqzXmJiosMTtT179mjRokWqW7eulixZovj4+CLtz9/fX0FBQQoKClJgYOAVI0NSscZUFHFxcUX+CN3cJ8nn1qRJE7Vt21alS5fW6NGjtXTp0jx/Nc9Pp06dtHjxYm3ZskVTpkyRt7e3RowYoRMnTmj79u0aNWqU2rZtq2+//VZbtmzR/Pnz83xiWE6rV6/W+PHj9fTTT2vTpk36/vvvNXPmTP35z38u0v184IEHVLZsWa1bt05r165V8+bNC31cbpTsxyAzM9O+LjU1Nc96LVq00Jw5c7Rt2zbNmDFD/v7+euONN/TDDz/Yc+SPP/5wuM2ZM2eu38ABwBBCA8BN64cfflCpUqVUo0YNBQcHy93dXWvWrHFYZ/v27fr999/t9+xnZGRo9OjRuvvuu/Xxxx8rMDBQo0aN0sWLF42Pr6hjyv5r+ZWkpaVp69atat68eaHrZmVlFbrNb775RtOmTVPv3r315JNPKjg4WN7e3le8zdChQzV48GBJUoUKFdS+fXsNGjRIGRkZOnnypHbs2KGsrCwNGTLEfrKfmZmpjRs32uPKLTY2Vl5eXnrqqafsc0RSU1MVGxtbpFcPypYtqw4dOmjNmjX6/vvv1a1bt0Jvc6Nkn6B+/Phx+7qcX+4oSRMmTFD37t1lWZY8PT3VqlUr++Oaf//9d4WGhsrDw0Pr1q1zuN2///3v6zx6ALh2vHUKQIl37tw57dy5076cnp6ur7/+WitWrFCPHj3sJ6j9+/fX+++/r9KlS6tVq1Y6cuSI3n33XdWuXVsPPfSQJGnGjBlKSEjQ4sWL5eHhoZiYGD3yyCOaPHmy8W+r9vb2LtKYss+7WL9+ve699177BP
dsSUlJmj9/vlxcXOTt7e1wLJKSkpSenq6EhATdcccd+uWXX5SUlHTFczkuXryomJgYVatWTUOGDCny/YmMjNSYMWM
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 23
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
|
"source": "Есть выбросы",
|
|||
|
"id": "3f22eadffa90565b"
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:12:21.725609Z",
|
|||
|
"start_time": "2024-12-07T05:12:21.617687Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"# Статистический анализ для определения выбросов\n",
|
|||
|
"Q1 = df['salary_in_usd'].quantile(0.25)\n",
|
|||
|
"Q3 = df['salary_in_usd'].quantile(0.75)\n",
|
|||
|
"IQR = Q3 - Q1\n",
|
|||
|
"\n",
|
|||
|
"# Определение порога для выбросов\n",
|
|||
|
"threshold = 1.5 * IQR\n",
|
|||
|
"outliers = (df['salary_in_usd'] < (Q1 - threshold)) | (df['salary_in_usd'] > (Q3 + threshold))\n",
|
|||
|
"\n",
|
|||
|
"# Вывод выбросов\n",
|
|||
|
"print(\"Выбросы:\")\n",
|
|||
|
"print(df[outliers])\n",
|
|||
|
"\n",
|
|||
|
"# Обработка выбросов\n",
|
|||
|
"# В данном случае мы заменим выбросы на медианное значение\n",
|
|||
|
"median_review_no = df['salary_in_usd'].median()\n",
|
|||
|
"df.loc[outliers, 'salary_in_usd'] = median_review_no\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df['salary_in_usd'])\n",
|
|||
|
"plt.title('Box Plot для salary_in_usd')\n",
|
|||
|
"plt.xlabel('salary_in_usd')\n",
|
|||
|
"plt.show()"
|
|||
|
],
|
|||
|
"id": "b5fa84c701764a4",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Выбросы:\n",
|
|||
|
" work_year experience_level employment_type \\\n",
|
|||
|
"33 2023 SE FT \n",
|
|||
|
"68 2023 SE FT \n",
|
|||
|
"83 2022 EN FT \n",
|
|||
|
"133 2023 SE FT \n",
|
|||
|
"145 2023 SE FT \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"3522 2020 MI FT \n",
|
|||
|
"3675 2021 EX CT \n",
|
|||
|
"3697 2020 EX FT \n",
|
|||
|
"3747 2021 MI FT \n",
|
|||
|
"3750 2020 SE FT \n",
|
|||
|
"\n",
|
|||
|
" job_title salary salary_currency \\\n",
|
|||
|
"33 Computer Vision Engineer 342810 USD \n",
|
|||
|
"68 Applied Scientist 309400 USD \n",
|
|||
|
"83 AI Developer 300000 USD \n",
|
|||
|
"133 Machine Learning Engineer 342300 USD \n",
|
|||
|
"145 Machine Learning Engineer 318300 USD \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"3522 Research Scientist 450000 USD \n",
|
|||
|
"3675 Principal Data Scientist 416000 USD \n",
|
|||
|
"3697 Director of Data Science 325000 USD \n",
|
|||
|
"3747 Applied Machine Learning Scientist 423000 USD \n",
|
|||
|
"3750 Data Scientist 412000 USD \n",
|
|||
|
"\n",
|
|||
|
" salary_in_usd employee_residence remote_ratio company_location \\\n",
|
|||
|
"33 342810 US 0 US \n",
|
|||
|
"68 309400 US 0 US \n",
|
|||
|
"83 300000 IN 50 IN \n",
|
|||
|
"133 342300 US 0 US \n",
|
|||
|
"145 318300 US 100 US \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"3522 450000 US 0 US \n",
|
|||
|
"3675 416000 US 100 US \n",
|
|||
|
"3697 325000 US 100 US \n",
|
|||
|
"3747 423000 US 50 US \n",
|
|||
|
"3750 412000 US 100 US \n",
|
|||
|
"\n",
|
|||
|
" company_size \n",
|
|||
|
"33 M \n",
|
|||
|
"68 L \n",
|
|||
|
"83 L \n",
|
|||
|
"133 L \n",
|
|||
|
"145 M \n",
|
|||
|
"... ... \n",
|
|||
|
"3522 M \n",
|
|||
|
"3675 S \n",
|
|||
|
"3697 L \n",
|
|||
|
"3747 L \n",
|
|||
|
"3750 L \n",
|
|||
|
"\n",
|
|||
|
"[63 rows x 11 columns]\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
],
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAyAAAAIhCAYAAAC2W955AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAyN0lEQVR4nO3dd5hU5f347/eidIRFKYlgV4qydCkCQYgiUozdiIpdo1iIDeInVjRqiIqICgZF1GCLRg2/gDExGiwBgrjRYEMFBJUSivS25/uHP+dypSyoPEu57+vykp1z5swz++yzzIuZM5OXZVkWAAAACZQp7QEAAAA7DgECAAAkI0AAAIBkBAgAAJCMAAEAAJIRIAAAQDICBAAASEaAAAAAyQgQgK2Ez4UFYEcgQICt2mmnnRb169cv9l/Lli2jd+/eMWHChCRjeOaZZ9YZQ8OGDePggw+Os846KyZNmpTb9+6774769etv1vG/+OKLOO+882LWrFk/9NC3Sp07d47+/fuX9jBy+vfvH507dy7tYWyWr38mZ86cWdpDAdhsO5f2AABKcuCBB8Z1110XERFr166NBQsWxGOPPRZnn312PPPMM3HAAQckGceQIUOiZs2aERFRVFQU8+bNi3vuuSdOP/30+OMf/xgNGjT4Tsd9/fXX45VXXvkhh8pmuPDCC6N3796lPQyAHYYAAbZ6VapUiaZNmxa77JBDDom2bdvGM888E/369UsyjoYNG0bdunWLXXbggQfG4YcfHqNGjYobb7wxyTj4Ye25556lPQSAHYqXYAHbpIoVK0b58uUjLy+v2OV/+ctf4thjj41mzZpFu3bt4tprr41FixZFRMSSJUuiU6dO0bVr11i1alVEfHXeRe/evaNdu3Yxf/78zR5H3bp1o3r16vHZZ59tcJ+NjemZZ56JX/3qVxER8dOf/nSjL02aNm3aOi8Fq1+//jovH+rcuXOJ+02aNClOPvnkaNas2UaP9W0jR46Mrl27RkFBQXTo0CGuv/76WLJkSW77/Pnz44YbbohOnTpFo0aNolWrVtGnT5+NvlRo5syZcdVVV0X79u3joIMOirZt28ZVV10VCxYsKHaffvOb38Tpp58ejRs3jv/7v/+L9u3bx+WXX77O8bp06RK//vWvN3o/vunbL8Hq3LlzDB48OG677bY45JBDonHjxnH22WfHtGnTNvmYERt+OV79+vXj7rvvzn09evToOOqoo6Jx48bRpk2buOKKK2L27Nm57UVFRXHvvffGoYceGk2aNIkLL7ww9/MDsC3yDAiw1cuyLNasWZP788KFC2PkyJGxatWqOO6443L73XvvvTF48ODo1atX/PKXv4xPP/007rrrrnjrrbfiySefjCpVqsTNN98cZ511VgwdOjQuueSSePjhh2P8+PFx//33x6677rrZY1uwYEEsWLBgg/+KXtKYDj300LjgggvivvvuiyFDhmz0/JEVK1bETjvtFKNGjSp2/KlTp66zb8eOHePCCy9c735LliyJX/ziF7H//vvH7bffHtWrV4+8vLwNHutro0ePjoEDB0a/fv2ifv368fHHH8dtt90Wy5cvj9tuuy2yLIvzzz8/Fi1aFFdccUXUqFEj3n///Rg0aFBcd9118cADD6xzzOXLl0fv3r2jevXqcd1118Uuu+wSkydPjiFDhkSFChWKPav0hz/8Ic4888w499xzo3LlylG9evV45JFHYsmSJVGlSpWI+Cqspk+fHrfeeusG78emePjhh6NFixZxyy23xKJFi+Lmm2+Ofv36xRNPPPG9jvttkyZNiquuuiouvPDCOPjgg+OLL76IgQMHxuWXXx6PPvpoREQMHDgwHn744bjggguiSZMmMWbMmLj99tt/0HEApCRAgK3exIkT46CDDlrn8ssuuyz222+/iIhYtGhR3HfffXHiiSfGtddem9unXr16ccopp8TTTz8dp5xyShxyyCFx0kknxf333x9NmjSJO+
64I0455ZTo2LFjieMoKirKhdDKlStj2rRp8bvf/S7KlCkTJ5100jr7b+qYvo6X9b3E65uWL18e5cuXL/ZytA1F06677rrB/T755JP48ssv47zzzotOnTqVeKyvTZgwIerWrRunnHJKlClTJlq1ahWVKlXK/Wv8nDlzomLFitGvX79o2bJlRES0bt06ZsyYscEH7tOmTYsf/ehHcdttt8Uee+wRERFt2rSJwsLCdd5kYPfdd48rrrgi93V+fn78/ve/jxdeeCEXos8++2zsvffe0bx5843el5JUrVo17r333thpp50iImLGjBlx9913x4IFC6J69erf69jfNGnSpKhQoUKcd955Ua5cuYj46n69/fbbkWVZLF68OB555JE488wz46KLLoqIiA4dOsScOXNi3LhxP9g4AFISIMBW76CDDoobbrghIr56BuTLL7+Mf/7zn3HnnXfGsmXL4pe//GW89dZbsWrVqujRo0ex67Zs2TLq1KkTEyZMiFNOOSUiIq666qp49dVX4xe/+EXss88+cdVVV23SOA4//PB1LqtTp04MHDhwvc9cbM6YNsXnn38eVatW3eT9N2SfffaJqlWrxlNPPRX77LNP/OhHP4qdd965xLcBbtOmTTzxxBNx7LHHxmGHHRYdO3aMnj175l4GV7t27Xj44Ycjy7KYOXNmTJ8+PT7++ON48803cy95+7aGDRvGqFGjoqioKKZNmxbTp0+PqVOnxscff5yLvW/u++370aJFi3juuefiuOOOixUrVsSYMWPi3HPP/R7fna8UFBTk4iMi4kc/+lFEfBWBP2SAHHzwwXHnnXdGjx494ogjjoiOHTtG+/btc0H81ltvxerVq4uFYkTEkUceKUCAbZYAAbZ6lStXjoKCgmKXtW/fPpYtWxbDhw+P3r175/4VvkaNGutcv0aNGrF48eJix+vSpUs8+OCD0bZt26hQocImjeO+++7LvQtW2bJlo3r16lG7du0N7r85Y9oUs2bNijp16mzWddanSpUqMWTIkLjpppviiCOOKLZtY8fv1q1bFBUVxahRo+Lee++Nu+++O+rUqRNXXHFFdOvWLSIinn/++bjjjjvi888/j/z8/GjYsGGJ398RI0bE0KFDY+HChVGjRo1o1KhRVKxYcZ3vT6VKlda57vHHHx9XX311fP755zFp0qRYunRpHH300Zv4ndiwihUrFvu6TJmvTpksKir63sf+pmbNmsX9998fDz30UIwYMSLuv//+qFGjRvziF7+I0047Lfcz9O3o+frnEGBb5CR0YJvVqFGjWLNmTcycOTOqVasWERHz5s1bZ7+5c+cWewD3wQcfxCOPPBINGzaMxx57LAoLCzfp9urVqxcFBQVRUFAQDRo02Gh8RMRmjWlTvPnmm5v8Vr/fPjn/21q3bh1dunSJsmXLRv/+/ePJJ59c51/Z16dHjx4xatSoGD9+fAwaNCjy8/PjyiuvjNmzZ8e///3v6NevX3Tp0iX++c9/xvjx4+Ohhx5a5x3MvunPf/5z3HrrrXHuuefGG2+8Ea+99loMGzYs9t577026n127do1KlSrF2LFjY8yYMdGuXbsS5yWVr+dg7dq1ucuWLl26zn4dOnSIBx54ICZOnBhDhw6NevXqxU033RT/+c9/cj8j//vf/4pdZ+HChVtu4ABbmAABtln/+c9/Yqeddoo99tgjmjRpEuXKlYvRo0cX2+ff//53fPbZZ7lzAtasWRP9+/ePPffcMx5//PFo0KBB9OvXL1auXPmDj29Tx/T1v65vzPLly2PChAnRrl27EvctKioq8ZivvPJKDBkyJHr37h1nnnlmNGnSJPLz8zd6nb59+0afPn0iImKXXXaJI488Mi688MJYs2ZNzJkzJyZPnhxFRUVx8cUX5yJg7dq18frrr+fG9W2TJk2KqlWrxjnnnJM7B2Xp0qUxadKkTXq2oVKlStGtW7cYPXp0vPbaa3HssceWeJ1Uvj4x/osvvshd9s0PrYyIuO
222+K4446LLMuiYsWK0alTp9zbSn/22WfRrFmzqFChQowdO7bY9f7xj39s4dEDbDleggVs9ZYsWRJvvfVW7utVq1b
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 24
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:15:42.200731Z",
|
|||
|
"start_time": "2024-12-07T05:15:42.186361Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"# Функция для создания выборок\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"dropna_df = df.dropna()\n",
|
|||
|
"df_input = dropna_df[[\"salary_in_usd\", \"employee_residence\", \"company_size\", \"work_year\", \"company_location\"]]\n",
|
|||
|
"\n",
|
|||
|
"train_df, temp_df = train_test_split(df_input, test_size=0.4, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение остатка на контрольную и тестовую выборки\n",
|
|||
|
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
|
"# Сохранение выборок в файлы\n",
|
|||
|
"train_df.to_csv(\".//static//csv//train_data.csv\", index=False)\n",
|
|||
|
"val_df.to_csv(\".//static//csv//val_data.csv\", index=False)\n",
|
|||
|
"test_df.to_csv(\".//static//csv//test_data.csv\", index=False)"
|
|||
|
],
|
|||
|
"id": "77ab4b875d5c36f2",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 2253\n",
|
|||
|
"Размер контрольной выборки: 751\n",
|
|||
|
"Размер тестовой выборки: 751\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 25
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T05:18:01.515059Z",
|
|||
|
"start_time": "2024-12-07T05:18:01.503530Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"train_df = pd.read_csv(\".//static//csv//train_data.csv\")\n",
|
|||
|
"val_df = pd.read_csv(\".//static//csv//val_data.csv\")\n",
|
|||
|
"test_df = pd.read_csv(\".//static//csv//test_data.csv\")\n",
|
|||
|
"\n",
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['company_size'].value_counts()\n",
|
|||
|
" print(f\"Распределение company_size в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print(f\"Процент L: {counts['L'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" print(f\"Процент M: {counts['M'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" print(f\"Процент S: {counts['S'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")"
|
|||
|
],
|
|||
|
"id": "bdd7580094a49586",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение company_size в обучающей выборке:\n",
|
|||
|
"company_size\n",
|
|||
|
"M 1893\n",
|
|||
|
"L 276\n",
|
|||
|
"S 84\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент L: 12.25%\n",
|
|||
|
"Процент M: 84.02%\n",
|
|||
|
"Процент S: 3.73%\n",
|
|||
|
"\n",
|
|||
|
"Распределение company_size в контрольной выборке:\n",
|
|||
|
"company_size\n",
|
|||
|
"M 622\n",
|
|||
|
"L 97\n",
|
|||
|
"S 32\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент L: 12.92%\n",
|
|||
|
"Процент M: 82.82%\n",
|
|||
|
"Процент S: 4.26%\n",
|
|||
|
"\n",
|
|||
|
"Распределение company_size в тестовой выборке:\n",
|
|||
|
"company_size\n",
|
|||
|
"M 638\n",
|
|||
|
"L 81\n",
|
|||
|
"S 32\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент L: 10.79%\n",
|
|||
|
"Процент M: 84.95%\n",
|
|||
|
"Процент S: 4.26%\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 27
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
|
"source": "Данные не сбалансированы",
|
|||
|
"id": "cb6aa8715719d743"
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 2
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython2",
|
|||
|
"version": "2.7.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 5
|
|||
|
}
|