AIM-PIbd-32-Kaznacheeva-E-K/lab_4/Lab4.ipynb

1949 lines
265 KiB
Plaintext
Raw Normal View History

2024-11-09 11:59:00 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Начало лабораторной работы**"
]
},
2024-11-23 12:17:48 +04:00
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['work_year', 'experience_level', 'employment_type', 'job_title',\n",
" 'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',\n",
" 'remote_ratio', 'company_location', 'company_size'],\n",
" dtype='object')\n"
]
}
],
"source": [
"# Load the data-science salaries dataset and list its column names.\n",
"# The CSV path is relative to the notebook's working directory.\n",
"import pandas as pd\n",
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>work_year</th>\n",
" <th>experience_level</th>\n",
" <th>employment_type</th>\n",
" <th>job_title</th>\n",
" <th>salary</th>\n",
" <th>salary_currency</th>\n",
" <th>salary_in_usd</th>\n",
" <th>employee_residence</th>\n",
" <th>remote_ratio</th>\n",
" <th>company_location</th>\n",
" <th>company_size</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Principal Data Scientist</td>\n",
" <td>80000</td>\n",
" <td>EUR</td>\n",
" <td>85847</td>\n",
" <td>ES</td>\n",
" <td>100</td>\n",
" <td>ES</td>\n",
" <td>L</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2023</td>\n",
" <td>MI</td>\n",
" <td>CT</td>\n",
" <td>ML Engineer</td>\n",
" <td>30000</td>\n",
" <td>USD</td>\n",
" <td>30000</td>\n",
" <td>US</td>\n",
" <td>100</td>\n",
" <td>US</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2023</td>\n",
" <td>MI</td>\n",
" <td>CT</td>\n",
" <td>ML Engineer</td>\n",
" <td>25500</td>\n",
" <td>USD</td>\n",
" <td>25500</td>\n",
" <td>US</td>\n",
" <td>100</td>\n",
" <td>US</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Scientist</td>\n",
" <td>175000</td>\n",
" <td>USD</td>\n",
" <td>175000</td>\n",
" <td>CA</td>\n",
" <td>100</td>\n",
" <td>CA</td>\n",
" <td>M</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Scientist</td>\n",
" <td>120000</td>\n",
" <td>USD</td>\n",
" <td>120000</td>\n",
" <td>CA</td>\n",
" <td>100</td>\n",
" <td>CA</td>\n",
" <td>M</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" work_year experience_level employment_type job_title \\\n",
"0 2023 SE FT Principal Data Scientist \n",
"1 2023 MI CT ML Engineer \n",
"2 2023 MI CT ML Engineer \n",
"3 2023 SE FT Data Scientist \n",
"4 2023 SE FT Data Scientist \n",
"\n",
" salary salary_currency salary_in_usd employee_residence remote_ratio \\\n",
"0 80000 EUR 85847 ES 100 \n",
"1 30000 USD 30000 US 100 \n",
"2 25500 USD 25500 US 100 \n",
"3 175000 USD 175000 CA 100 \n",
"4 120000 USD 120000 CA 100 \n",
"\n",
" company_location company_size \n",
"0 ES L \n",
"1 US S \n",
"2 US S \n",
"3 CA M \n",
"4 CA M "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows to sanity-check the load.\n",
"# NOTE(review): the saved execution counts are out of order (this cell is 10, the\n",
"# next one is 9) - re-run the notebook top to bottom before sharing.\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>work_year</th>\n",
" <th>salary</th>\n",
" <th>salary_in_usd</th>\n",
" <th>remote_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>3755.000000</td>\n",
" <td>3.755000e+03</td>\n",
" <td>3755.000000</td>\n",
" <td>3755.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>2022.373635</td>\n",
" <td>1.906956e+05</td>\n",
" <td>137570.389880</td>\n",
" <td>46.271638</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.691448</td>\n",
" <td>6.716765e+05</td>\n",
" <td>63055.625278</td>\n",
" <td>48.589050</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>2020.000000</td>\n",
" <td>6.000000e+03</td>\n",
" <td>5132.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>2022.000000</td>\n",
" <td>1.000000e+05</td>\n",
" <td>95000.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>2022.000000</td>\n",
" <td>1.380000e+05</td>\n",
" <td>135000.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>2023.000000</td>\n",
" <td>1.800000e+05</td>\n",
" <td>175000.000000</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>2023.000000</td>\n",
" <td>3.040000e+07</td>\n",
" <td>450000.000000</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" work_year salary salary_in_usd remote_ratio\n",
"count 3755.000000 3.755000e+03 3755.000000 3755.000000\n",
"mean 2022.373635 1.906956e+05 137570.389880 46.271638\n",
"std 0.691448 6.716765e+05 63055.625278 48.589050\n",
"min 2020.000000 6.000000e+03 5132.000000 0.000000\n",
"25% 2022.000000 1.000000e+05 95000.000000 0.000000\n",
"50% 2022.000000 1.380000e+05 135000.000000 0.000000\n",
"75% 2023.000000 1.800000e+05 175000.000000 100.000000\n",
"max 2023.000000 3.040000e+07 450000.000000 100.000000"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics (count, mean, std, quartiles) for the numeric columns.\n",
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"work_year 0\n",
"experience_level 0\n",
"employment_type 0\n",
"job_title 0\n",
"salary 0\n",
"salary_currency 0\n",
"salary_in_usd 0\n",
"employee_residence 0\n",
"remote_ratio 0\n",
"company_location 0\n",
"company_size 0\n",
"dtype: int64\n",
"work_year False\n",
"experience_level False\n",
"employment_type False\n",
"job_title False\n",
"salary False\n",
"salary_currency False\n",
"salary_in_usd False\n",
"employee_residence False\n",
"remote_ratio False\n",
"company_location False\n",
"company_size False\n",
"dtype: bool\n"
]
}
],
"source": [
"# Share of missing values per feature; a line is printed only for columns that have any.\n",
"missing_counts = df.isnull().sum()\n",
"row_count = len(df)\n",
"for i in df.columns:\n",
"    null_rate = missing_counts[i] / row_count * 100\n",
"    if null_rate > 0:\n",
"        print(f'{i} Процент пустых значений: %{null_rate:.2f}')\n",
"\n",
"# Absolute number of missing values in every column.\n",
"print(missing_counts)\n",
"\n",
"# Per-column flag: does the column contain at least one missing value?\n",
"print(df.isnull().any())"
]
},
2024-11-23 12:21:27 +04:00
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Классификация"
]
},
2024-11-23 12:17:48 +04:00
{
"cell_type": "code",
2024-11-23 12:21:27 +04:00
"execution_count": null,
2024-11-23 12:17:48 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'X_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>work_year</th>\n",
" <th>experience_level</th>\n",
" <th>employment_type</th>\n",
" <th>job_title</th>\n",
" <th>salary</th>\n",
" <th>salary_currency</th>\n",
" <th>salary_in_usd</th>\n",
" <th>employee_residence</th>\n",
" <th>remote_ratio</th>\n",
" <th>company_location</th>\n",
" <th>company_size</th>\n",
" <th>above_median_salary</th>\n",
" <th>salary_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1809</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Engineer</td>\n",
" <td>182000</td>\n",
" <td>USD</td>\n",
" <td>182000</td>\n",
" <td>US</td>\n",
" <td>100</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1082</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Machine Learning Engineer</td>\n",
" <td>126000</td>\n",
" <td>USD</td>\n",
" <td>126000</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1686</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>BI Developer</td>\n",
" <td>140000</td>\n",
" <td>USD</td>\n",
" <td>140000</td>\n",
" <td>US</td>\n",
" <td>100</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1600</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Scientist</td>\n",
" <td>140000</td>\n",
" <td>USD</td>\n",
" <td>140000</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1376</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Engineer</td>\n",
" <td>226700</td>\n",
" <td>USD</td>\n",
" <td>226700</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2706</th>\n",
" <td>2022</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Engineer</td>\n",
" <td>160000</td>\n",
" <td>USD</td>\n",
" <td>160000</td>\n",
" <td>US</td>\n",
" <td>100</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>928</th>\n",
" <td>2023</td>\n",
" <td>MI</td>\n",
" <td>FT</td>\n",
" <td>Data Engineer</td>\n",
" <td>200000</td>\n",
" <td>USD</td>\n",
" <td>200000</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>564</th>\n",
" <td>2023</td>\n",
" <td>MI</td>\n",
" <td>FT</td>\n",
" <td>Data Engineer</td>\n",
" <td>140000</td>\n",
" <td>USD</td>\n",
" <td>140000</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>716</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Scientist</td>\n",
" <td>297300</td>\n",
" <td>USD</td>\n",
" <td>297300</td>\n",
" <td>US</td>\n",
" <td>100</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1299</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Engineer</td>\n",
" <td>133832</td>\n",
" <td>USD</td>\n",
" <td>133832</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3004 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" work_year experience_level employment_type job_title \\\n",
"1809 2023 SE FT Data Engineer \n",
"1082 2023 SE FT Machine Learning Engineer \n",
"1686 2023 SE FT BI Developer \n",
"1600 2023 SE FT Data Scientist \n",
"1376 2023 SE FT Data Engineer \n",
"... ... ... ... ... \n",
"2706 2022 SE FT Data Engineer \n",
"928 2023 MI FT Data Engineer \n",
"564 2023 MI FT Data Engineer \n",
"716 2023 SE FT Data Scientist \n",
"1299 2023 SE FT Data Engineer \n",
"\n",
" salary salary_currency salary_in_usd employee_residence remote_ratio \\\n",
"1809 182000 USD 182000 US 100 \n",
"1082 126000 USD 126000 US 0 \n",
"1686 140000 USD 140000 US 100 \n",
"1600 140000 USD 140000 US 0 \n",
"1376 226700 USD 226700 US 0 \n",
"... ... ... ... ... ... \n",
"2706 160000 USD 160000 US 100 \n",
"928 200000 USD 200000 US 0 \n",
"564 140000 USD 140000 US 0 \n",
"716 297300 USD 297300 US 100 \n",
"1299 133832 USD 133832 US 0 \n",
"\n",
" company_location company_size above_median_salary salary_category \n",
"1809 US M 1 1 \n",
"1082 US M 0 1 \n",
"1686 US M 1 1 \n",
"1600 US M 1 1 \n",
"1376 US M 1 2 \n",
"... ... ... ... ... \n",
"2706 US M 1 1 \n",
"928 US M 1 1 \n",
"564 US M 1 1 \n",
"716 US M 1 2 \n",
"1299 US M 0 1 \n",
"\n",
"[3004 rows x 13 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>above_median_salary</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1809</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1082</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1686</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1600</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1376</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2706</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>928</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>564</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>716</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1299</th>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3004 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" above_median_salary\n",
"1809 1\n",
"1082 0\n",
"1686 1\n",
"1600 1\n",
"1376 1\n",
"... ...\n",
"2706 1\n",
"928 1\n",
"564 1\n",
"716 1\n",
"1299 0\n",
"\n",
"[3004 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'X_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>work_year</th>\n",
" <th>experience_level</th>\n",
" <th>employment_type</th>\n",
" <th>job_title</th>\n",
" <th>salary</th>\n",
" <th>salary_currency</th>\n",
" <th>salary_in_usd</th>\n",
" <th>employee_residence</th>\n",
" <th>remote_ratio</th>\n",
" <th>company_location</th>\n",
" <th>company_size</th>\n",
" <th>above_median_salary</th>\n",
" <th>salary_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3459</th>\n",
" <td>2022</td>\n",
" <td>MI</td>\n",
" <td>FT</td>\n",
" <td>Research Scientist</td>\n",
" <td>59000</td>\n",
" <td>EUR</td>\n",
" <td>61989</td>\n",
" <td>AT</td>\n",
" <td>0</td>\n",
" <td>AT</td>\n",
" <td>L</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3724</th>\n",
" <td>2021</td>\n",
" <td>EN</td>\n",
" <td>FT</td>\n",
" <td>Business Data Analyst</td>\n",
" <td>50000</td>\n",
" <td>EUR</td>\n",
" <td>59102</td>\n",
" <td>LU</td>\n",
" <td>100</td>\n",
" <td>LU</td>\n",
" <td>L</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1795</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Engineer</td>\n",
" <td>180000</td>\n",
" <td>USD</td>\n",
" <td>180000</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3535</th>\n",
" <td>2021</td>\n",
" <td>MI</td>\n",
" <td>FT</td>\n",
" <td>Data Scientist</td>\n",
" <td>50000</td>\n",
" <td>USD</td>\n",
" <td>50000</td>\n",
" <td>NG</td>\n",
" <td>100</td>\n",
" <td>NG</td>\n",
" <td>L</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3255</th>\n",
" <td>2022</td>\n",
" <td>MI</td>\n",
" <td>FT</td>\n",
" <td>Data Analyst</td>\n",
" <td>106260</td>\n",
" <td>USD</td>\n",
" <td>106260</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1943</th>\n",
" <td>2022</td>\n",
" <td>MI</td>\n",
" <td>FT</td>\n",
" <td>Data Engineer</td>\n",
" <td>120000</td>\n",
" <td>USD</td>\n",
" <td>120000</td>\n",
" <td>US</td>\n",
" <td>100</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>573</th>\n",
" <td>2023</td>\n",
" <td>EN</td>\n",
" <td>FT</td>\n",
" <td>Autonomous Vehicle Technician</td>\n",
" <td>7000</td>\n",
" <td>USD</td>\n",
" <td>7000</td>\n",
" <td>GH</td>\n",
" <td>0</td>\n",
" <td>GH</td>\n",
" <td>S</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3013</th>\n",
" <td>2022</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Machine Learning Engineer</td>\n",
" <td>129300</td>\n",
" <td>USD</td>\n",
" <td>129300</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>327</th>\n",
" <td>2023</td>\n",
" <td>EN</td>\n",
" <td>FT</td>\n",
" <td>Data Scientist</td>\n",
" <td>70000</td>\n",
" <td>CAD</td>\n",
" <td>51753</td>\n",
" <td>CA</td>\n",
" <td>100</td>\n",
" <td>CA</td>\n",
" <td>L</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1565</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Analyst</td>\n",
" <td>48000</td>\n",
" <td>EUR</td>\n",
" <td>51508</td>\n",
" <td>ES</td>\n",
" <td>0</td>\n",
" <td>ES</td>\n",
" <td>M</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>751 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" work_year experience_level employment_type \\\n",
"3459 2022 MI FT \n",
"3724 2021 EN FT \n",
"1795 2023 SE FT \n",
"3535 2021 MI FT \n",
"3255 2022 MI FT \n",
"... ... ... ... \n",
"1943 2022 MI FT \n",
"573 2023 EN FT \n",
"3013 2022 SE FT \n",
"327 2023 EN FT \n",
"1565 2023 SE FT \n",
"\n",
" job_title salary salary_currency salary_in_usd \\\n",
"3459 Research Scientist 59000 EUR 61989 \n",
"3724 Business Data Analyst 50000 EUR 59102 \n",
"1795 Data Engineer 180000 USD 180000 \n",
"3535 Data Scientist 50000 USD 50000 \n",
"3255 Data Analyst 106260 USD 106260 \n",
"... ... ... ... ... \n",
"1943 Data Engineer 120000 USD 120000 \n",
"573 Autonomous Vehicle Technician 7000 USD 7000 \n",
"3013 Machine Learning Engineer 129300 USD 129300 \n",
"327 Data Scientist 70000 CAD 51753 \n",
"1565 Data Analyst 48000 EUR 51508 \n",
"\n",
" employee_residence remote_ratio company_location company_size \\\n",
"3459 AT 0 AT L \n",
"3724 LU 100 LU L \n",
"1795 US 0 US M \n",
"3535 NG 100 NG L \n",
"3255 US 0 US M \n",
"... ... ... ... ... \n",
"1943 US 100 US M \n",
"573 GH 0 GH S \n",
"3013 US 0 US M \n",
"327 CA 100 CA L \n",
"1565 ES 0 ES M \n",
"\n",
" above_median_salary salary_category \n",
"3459 0 0 \n",
"3724 0 0 \n",
"1795 1 1 \n",
"3535 0 0 \n",
"3255 0 1 \n",
"... ... ... \n",
"1943 0 1 \n",
"573 0 0 \n",
"3013 0 1 \n",
"327 0 0 \n",
"1565 0 0 \n",
"\n",
"[751 rows x 13 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>above_median_salary</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3459</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3724</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1795</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3535</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3255</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1943</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>573</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3013</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>327</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1565</th>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>751 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" above_median_salary\n",
"3459 0\n",
"3724 0\n",
"1795 1\n",
"3535 0\n",
"3255 0\n",
"... ...\n",
"1943 0\n",
"573 0\n",
"3013 0\n",
"327 0\n",
"1565 0\n",
"\n",
"[751 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"work_year int64\n",
"experience_level object\n",
"employment_type object\n",
"job_title object\n",
"salary int64\n",
"salary_currency object\n",
"salary_in_usd int64\n",
"employee_residence object\n",
"remote_ratio int64\n",
"company_location object\n",
"company_size object\n",
"above_median_salary int64\n",
"salary_category category\n",
"dtype: object\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIjCAYAAAAJLyrXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACHyElEQVR4nOzdd3xV9eH/8dcdyc3emySMEEiYCgrGAQ4UETeto6DWtmottq4qddTZlqr91VWr9tsqttW6dxVFpiKCskdYYVwgZNzsndx7z++PwK2RYW5Icm6S9/PxuA+4555zz/sml3DfOed8PhbDMAxERERERESk3axmBxAREREREelpVKRERERERET8pCIlIiIiIiLiJxUpERERERERP6lIiYiIiIiI+ElFSkRERERExE8qUiIiIiIiIn5SkRIREREREfGTipSIiIiIiIifVKRERERERET8pCIlItJN5syZg8Vi8d1CQkIYMmQIN910E8XFxWbHExERET/YzQ4gItLXPPTQQwwcOJDGxka++OILnn32WT766CM2bNhAWFiY2fFERESkHVSkRES62ZQpUzjhhBMA+NnPfkZ8fDx//vOfee+997jyyitNTiciIiLtoVP7RERMduaZZwKwc+dOAMrLy/n1r3/NyJEjiYiIICoqiilTprB27dpDtm1sbOSBBx5gyJAhhISEkJqayqWXXkpBQQEAu3btanM64Xdvp59+uu+5Fi1ahMVi4bXXXuPuu+8mJSWF8PBwLrzwQvbs2XPIvpcvX865555LdHQ0YWFhTJw4kaVLlx72NZ5++umH3f8DDzxwyLr//ve/GTt2LKGhocTFxXHFFVccdv9He23f5vV6eeKJJxg+fDghISEkJydzww03UFFR0Wa9AQMGcP755x+yn5tuuumQ5zxc9scee+yQrylAU1MT999/P4MHD8bhcJCRkcGdd95JU1PTYb9W3/bzn/+c7OxswsLCiIuL48wzz+Tzzz9vs857773H1KlTSUtLw+FwkJWVxcMPP4zH42mz3umnn86IESNYuXIlJ598MqGhoQwcOJDnnnuuzXoH3wdHuv34xz8+JOeRvr9z5szxrfP555/zwx/+kMzMTN/X4dZbb6WhocG3zo9//OOj7ttisbBr167v/bqJiHQHHZESETHZwdITHx8PwI4dO3j33Xf54Q9/yMCBAykuLub5559n4sSJbNq0ibS0NAA8Hg/nn38+8+fP54orruDmm2+mpqaGefPmsWHDBrKysnz7uPLKKznvvPPa7Peuu+46bJ7f//73WCwWZs2aRUlJCU888QSTJk1izZo1hIaGArBgwQKmTJnC2LFjuf/++7Farbz44ou+D/rjxo075HnT09OZPXs2ALW1tdx4442H3fdvf/tbLrvsMn72s59RWlrK008/zYQJE1i9ejUxMTGHbHP99ddz2mmnAfD222/zzjvvtHn8hhtuYM6cOVx77bX86le/YufOnfzlL39h9erVLF26lKCgoMN+HfxRWVnpe23f5vV6ufDCC/niiy+4/vrryc3NZf369Tz++ONs3bqVd99996jP29zczIwZM0hPT6e8vJznn3+ec889l/z8fDIzM4HWa+8iIiK47bbbiIiIYMGCBdx3331UV1fz2GOPtXm+iooKzjvvPC677DKuvPJKXn/9dW688UaCg4P5yU9+0mbdX/3qV5x44oltlv3sZz87YtacnBzuueceAFwuF7feemubx9944w3q6+u58cYbiY+PZ8WKFTz99NPs3buXN954A2j9Xk2aNMm3zVVXXcUll1zCpZde6luWmJh41K+ZiEi3MUREpFu8+OKLBmB89tlnRmlpqbFnzx7j1VdfNeLj443Q0FBj7969hmEYRmNjo+HxeNpsu3PnTsPhcBgPPfSQb9kLL7xgAMaf//znQ/bl9Xp92wHGY489dsg6w4cPNyZOnOi7v3DhQgMw+vXrZ1RXV/uWv/766wZgPPnkk77nzs
7ONiZPnuzbj2EYRn19vTFw4EDj7LPPPmRfJ598sjFixAjf/dLSUgMw7r//ft+yXbt2GTabzfj973/fZtv169cbdrv9kOXbtm0zAOOll17yLbv//vuNb//X9vnnnxuA8fLLL7fZdu7cuYcs79+/vzF16tRDss+cOdP47n+X381+5513GklJScbYsWPbfE3/9a9/GVar1fj888/bbP/cc88ZgLF06dJD9nc0K1asMADjzTff9C2rr68/ZL0bbrjBCAsLMxobG33LJk6caADG//t//8+3rKmpyTjuuOOMpKQko7m52TCM/70P3njjjUOeNzw83LjmmmsOWX7KKacYZ5xxhu/+wffdiy++eNScs2fPNiwWi7F79+7Dvt7vfp1FRAKJTu0TEelmkyZNIjExkYyMDK644goiIiJ455136NevHwAOhwOrtfXHs8fjoaysjIiICIYOHcqqVat8z/PWW2+RkJDAL3/5y0P28d1T0fxx9dVXExkZ6bv/gx/8gNTUVD766CMA1qxZw7Zt2/jRj35EWVkZLpcLl8tFXV0dZ511FkuWLMHr9bZ5zsbGRkJCQo6637fffhuv18tll13me06Xy0VKSgrZ2dksXLiwzfrNzc1A69frSN544w2io6M5++yz2zzn2LFjiYiIOOQ5W1pa2qzncrlobGw8au59+/bx9NNP89vf/paIiIhD9p+bm0tOTk6b5zx4Oud39384jY2NuFwu8vPzefLJJwkNDfVdYwf4jhIC1NTU4HK5OO2006ivr2fz5s1tnstut3PDDTf47gcHB3PDDTdQUlLCypUrvzfLkTQ3Nx/1+/DdnHV1dbhcLk4++WQMw2D16tUd3reIiFl0ap+ISDd75plnGDJkCHa7neTkZIYOHeorTtB6OtiTTz7JX//6V3bu3NnmWpeDp/9B6ymBQ4cOxW7v3B/l2dnZbe5bLBYGDx7suzZl27ZtAFxzzTVHfI6qqipiY2N9910u1yHP+13btm3DMIwjrvfdU/AqKysBDikv333OqqoqkpKSDvt4SUlJm/uffvqp36eO3X///aSlpXHDDTfw5ptvHrL//Pz8Iz7nd/d/OHPmzPGdBpmSksK8efPo37+/7/GNGzdy7733smDBAqqrq9tsW1VV1eZ+Wloa4eHhbZYNGTIEaL3m7KSTTvrePIdTWVnZJtPhOJ1O7rvvPt5///1Drk/7bk4RkZ5ARUpEpJuNGzeuzRGF7/rDH/7Ab3/7W37yk5/w8MMPExcXh9Vq5ZZbbjnkSI8ZDmZ47LHHOO644w67zrfLTXNzM/v37+fss8/+3ue1WCx8/PHH2Gy2oz4nQFFREdBaLo72nElJSbz88suHffy7BWf8+PH87ne/a7PsL3/5C++9995ht8/Pz2fOnDn8+9//Puy1Vl6vl5EjR/LnP//5sNtnZGQcMftBF1xwAYMHD6akpITnnnuOyy+/nC+++IIBAwZQWVnJxIkTiYqK4qGHHiIrK4uQkBBWrVrFrFmzuu39UlRUxOTJk4/4uMfj4eyzz6a8vJxZs2aRk5NDeHg4+/bt48c//nFAvK9FRPylIiUiEmDefPNNzjjjDP7xj3+0WV5ZWUlCQoLvflZWFsuXL6elpaVTBkw46OARp4MMw2D79u2MGjXKt1+AqKioNgMDHMnatWtpaWk5ank8+LyGYTBw4EDfUZKj2bRpExaLhaFDhx71OT/77DNOOeWUNqeWHUlCQsIhr+loA0LcddddHHfccVx++eVH3P/atWs566yzOny6Zb9+/XynfV566aUkJCTw7LPP8sgjj7Bo0SLKysp4++23mTBhgm+bgyNAfldhYSF1dXVtjkpt3boVaB21sCP27t1LTU0Nubm5R1xn/fr1bN26lZdeeomrr77at3zevHkd2qeISCDQNVIiIgHGZrNhGEabZW+88Qb79u1rs2zatGm4XC7+8pe/HPIc393eH//85z+pqanx3X/zzTfZv38/U6ZMAWDs2LFkZWXxpz/9id
ra2kO2Ly0tPSS7zWY77NDi33bppZdis9l48MEHD8lvGAZlZWW++263m7feeotx48Yd9dS+yy67DI/Hw8MPP3zIY26
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2wAAAIjCAYAAAB/FZhcAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABu0UlEQVR4nO3deVxUZf//8feAsrgAIsqSgpA7SpqWkYVZlhalpbkkla2m0qatJkoa5m1lpgkudauZdFvWXWaaSVpShpnbTbm1SFIhqKFgoqAz5/dHP+brCCQoMAd4PR+PecSc65oz79lsPnNd5zoWwzAMAQAAAABMx8XZAQAAAAAApaNgAwAAAACTomADAAAAAJOiYAMAAAAAk6JgAwAAAACTomADAAAAAJOiYAMAAAAAk6JgAwAAAACTomADAAAAAJOiYAMAAAAAk6JgA1BjzZs3T3379pW/v7/q16+vgIAA9erVS0uWLJHNZnN2PAAAgAtmMQzDcHYIADgfkZGRCgwM1LXXXisvLy8dPXpUmzZt0rJlyzR06FD95z//cXZEAACAC0LBBqDGOnXqlOrXr19i+yOPPKI5c+YoIyNDrVq1qv5gAAAAlYQpkQBqrNKKNUn2Is3F5f/+iVuxYoWio6MVFBQkd3d3XXzxxXrhhRdktVodbnvNNdfIYrHYL35+foqOjtYPP/zg0M9isej555932Pbyyy/LYrHommuucdh+8uRJPf/882rbtq08PDwUGBiogQMH6pdffpEk/frrr7JYLFq8eLHD7WJjY2WxWHTPPffYty1evFgWi0Vubm46dOiQQ/+0tDR77i1btji0LV++XN26dZOnp6f8/Px055136o8//ijx3O3Zs0dDhgxRs2bN5OnpqXbt2mnChAmSpOeff97huSnt8uWXX9qfx06dOpXYf3mMGjVKbdq0UYMGDeTr66trr71WX331lUOfiryenTp10tatW3XllVfK09NToaGhmjdvnkO/L7/88h8f15mvwZn7Lq3vma/jV199pcGDBys4OFju7u5q2bKlxo4dqxMnTtj73HPPPed8Xn/99VdJf7+3b7755jKfu+LHUfw6/JPt27frxhtvlJeXlxo1aqTrrrtOmzZtsrcXv9f+6XL2e7bYvn37ZLFYNHPmzBJt33zzjSwWi30EvPh9Vfze8/LyUtOmTfXYY4/p5MmTDrc9ffq0XnjhBV188cVyd3dXq1at9Nxzz6mwsNChX6tWrewZXVxcFBAQoKFDhyozM9Pep/hz98orr5TI2KlTpxKf43379mnw4MEKCgqSi4uLff/leZ+fT+7SLmeyWCx6+OGHy7zP4tfvzPfOP+37zB+3XnnlFV155ZVq2rSpPD091a1bN73//vvnfJwAqkY9ZwcAgAt19OhRnT59WseOHdPWrVv1yiuvaNiwYQoODrb3Wbx4sRo1aqRx48apUaNGWr9+vSZNmqT8/Hy9/PLLDvtr3769JkyYIMMw9Msvv+jVV1/VTTfd5PBlr7QM06ZNK7HdarXq5ptv1rp16zRs2DA99thjOnbsmFJSUvTDDz/o4osvLnV/P//8s954440y78/V1VVLly7V2LFj7dsWLVokDw+PEl9yFy9erHvvvVeXXXaZpk2bppycHM2aNUsbN27U9u3b5ePjI0lKT0/X1Vdfrfr162vkyJFq1aqVfvnlF61cuVJTp07VwIED1bp1a/t+x44dqw4dOmjkyJH2bR06dCgzc3kVFRXpzjvvVIsWLZSbm6v58+erX79+2r17t/01rcjreeTIEd10000aMmSI7rjjDr333nsaPXq03NzcdN999zn0ffTRR3XZZZc5bHvggQfKzFr8XpGkw4cPO7we0t+FckFBgUaPHq2mTZtq8+bNev311/X7779r+fLlkqSHHnpIffr0sd/mrrvu0m233aaBAwfatzVr1qy8T1+57Ny5U1dffbW8vLz09NNPq379+po/f76uueYabdiwQT169F
BUVJTefvtt+22mTp0qSfbHK0lXXnllqfsPCwtTz549lZycXOI5SU5OVuPGjTVgwACH7UOGDFGrVq00bdo0bdq0SbNnz9aRI0e0ZMkSe58HHnhAb731lm6//XY98cQT+vbbbzVt2jTt3r1bH374ocP+rr76ao0cOVI2m00//PCDXnvtNWVlZZUo/svDarWqf//+2r9/vx5//HG1bdtWFovF/pycS0Vyd+nSRU888YTDtiVLliglJaXCuc/02muv6a+//pIk7d69Wy+++KKee+45+2e2UaNG9r6zZs1S//79FRMTo6KiIi1btkyDBw/WJ598oujo6AvKAeA8GABQw7Vr186QZL/cfffdxqlTpxz6FBQUlLjdQw89ZDRo0MA4efKkfVuvXr2MXr16OfR77rnnDEnGwYMH7dskGfHx8fbrTz/9tNG8eXOjW7duDrdfuHChIcl49dVXS9y/zWYzDMMwMjIyDEnGokWL7G1DhgwxOnXqZLRs2dIYMWKEffuiRYsMScYdd9xhdO7c2b79+PHjhpeXlzF8+HBDkvHdd98ZhmEYRUVFRvPmzY1OnToZJ06csPf/5JNPDEnGpEmT7NuioqKMxo0bG/v37y8159lCQkIcsp2pV69eRnh4eKltFbV582ZDkvH+++/bt1Xk9ZRkzJgxw76tsLDQ6NKli9G8eXOjqKjIMAzD+OKLLwxJxvLly0vst2HDhqU+zp49exq9e/e2Xy/tdSwt57Rp0wyLxVLieS529nvrTCEhIUZ0dHSpbWc+ji+++KLMPoZhGLfeeqvh5uZm/PLLL/ZtWVlZRuPGjY2oqKhSb1PaZ+OfzJ8/35Bk7N69276tqKjI8PPzc3g+4+PjDUlG//79HW4/ZswYQ5Lxv//9zzAMw9ixY4chyXjggQcc+j355JOGJGP9+vX2baW9N4cPH240aNDAfr349Xr55ZdLZA8PD3d4rHv37jUkGdOmTXPoV573eUVzl/b6xsbGGmd/ZZNkxMbGlnm/xf9WZGRklGg71/vk7PdtUVGR0alTJ+Paa68t8/4AVB2mRAKo8RYtWqSUlBQlJyfr/vvvV3JyssOojyR5enra/z527JgOHz6sq6++WgUFBdqzZ49D31OnTunw4cM6dOiQ0tLS9OGHHyoiIkJ+fn6l3v8ff/yh119/XRMnTnT4lVqSPvjgA/n5+emRRx4pcbuzpzgV27p1q5YvX65p06Y5TOs801133aU9e/bYpz5+8MEH8vb21nXXXefQb8uWLTp48KDGjBkjDw8P+/bo6Gi1b99eq1atkiQdOnRIqampuu+++xxGJv8p57lYrVYdPnxYhw8fVlFRUYVue/LkSR0+fFi7d+/WrFmz5Onpqe7du9vbK/J61qtXTw899JD9upubmx566CEdPHhQW7duPa/HJv09Euju7v6Pfc7Mefz4cR0+fFhXXnmlDMPQ9u3bz+t+i9+ff/75p06fPl3h21utVq1du1a33nqrwsLC7NsDAwM1fPhwff3118rPzz+vbGcaMmSIPDw8lJycbN/22Wef6fDhw7rzzjtL9I+NjXW4XvyZWb16tcN/x40b59CveDSq+L1crLCwUIcPH9bBgweVkpKi9evXl/h8SFJBQYH9fVp8OXtq7bFjxyRJTZs2PfcDP0tFc1dE8efkzz//rNSVcc983x45ckR5eXm6+uqrtW3btkq7DwDlR8EGoMaLjIxUnz59NHz4cL355puaMmWKFi1apI0bN9r77Ny5U7fddpu8vb3l5eWlZs2a2b805uXlOezvm2++UbNmzdS8eXNdeeWVOn36tJYvX15m4RIfH6+goCCHoqDYL7/8onbt2qlevfLPQH/22Wd19dVX/+OxSs2aNVN0dLQWLlwoSVq4cKFGjBhRosDbv3+/JKldu3Yl9tG+fXt7+759+yTpvI87K82ePXvUrFkzh+Ph3nnnnXLddvHixWrWrJk6duyodevWKSUlRSEhIfb2iryeQUFBatiwocO2tm3bSp
L9+J7zcfTo0RIF+tkyMzN1zz33yNfXV40aNVKzZs3Uq1evUnOW19q1a9WsWTP5+fnJw8NDl156qdauXVvu2x86dEg
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from typing import Tuple\n",
"import pandas as pd\n",
"from pandas import DataFrame\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Load the data (the CSV is read again here, so this cell does not rely on the `df` from the first cell).\n",
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
"\n",
"# Build the binary target: 1 when salary_in_usd is strictly above the dataset median, otherwise 0.\n",
"median_salary = df['salary_in_usd'].median()\n",
"df['above_median_salary'] = np.where(df['salary_in_usd'] > median_salary, 1, 0)\n",
"\n",
"# Split into features and target.\n",
"# NOTE(review): the raw `salary` column stays in X and may leak the target - confirm.\n",
"X = df.drop(columns=['salary_in_usd', 'above_median_salary'])\n",
"y = df['above_median_salary']\n",
"\n",
"# Coarse salary bands (right-closed bins): (0, 100k] -> 0, (100k, 200k] -> 1, above 200k -> 2.\n",
"df['salary_category'] = pd.cut(df['salary_in_usd'], bins=[0, 100000, 200000, np.inf], labels=[0, 1, 2])\n",
"\n",
"# Re-select the features.\n",
"# NOTE(review): this overwrites the X built above and, unlike it, keeps `above_median_salary`\n",
"# (the target y) among the features - looks like unintended leakage; confirm whether it should be dropped here too.\n",
"X = df.drop(columns=['salary_in_usd', 'salary_category'])\n",
"\n",
"def split_stratified_into_train_val_test(\n",
"    df_input,\n",
"    stratify_colname=\"y\",\n",
"    frac_train=0.6,\n",
"    frac_val=0.15,\n",
"    frac_test=0.25,\n",
"    random_state=None,\n",
") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
"    \"\"\"Split a dataframe into stratified train / validation / test parts.\n",
"\n",
"    Args:\n",
"        df_input: Source dataframe. Every column, including the stratification\n",
"            column, is kept in the returned feature frames.\n",
"        stratify_colname: Column whose class proportions are preserved in each part.\n",
"        frac_train: Fraction of rows assigned to the training part.\n",
"        frac_val: Fraction assigned to validation; a value <= 0 skips the validation split.\n",
"        frac_test: Fraction assigned to the test part.\n",
"        random_state: Seed forwarded to ``train_test_split``.\n",
"\n",
"    Returns:\n",
"        ``(df_train, df_val, df_test, y_train, y_val, y_test)``. The ``y_*`` items are\n",
"        single-column dataframes. When ``frac_val <= 0`` both validation items are\n",
"        empty dataframes and the whole remainder is returned as the test part.\n",
"\n",
"    Raises:\n",
"        ValueError: If the fractions do not sum to 1 (within floating-point tolerance)\n",
"            or ``stratify_colname`` is not a column of ``df_input``.\n",
"    \"\"\"\n",
"    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly 1.0 in\n",
"    # binary floating point, so an exact `!=` test rejected valid inputs.\n",
"    if abs(frac_train + frac_val + frac_test - 1.0) > 1e-9:\n",
"        raise ValueError(\n",
"            \"fractions %f, %f, %f do not add up to 1.0\"\n",
"            % (frac_train, frac_val, frac_test)\n",
"        )\n",
"\n",
"    if stratify_colname not in df_input.columns:\n",
"        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"    X = df_input  # Contains all columns.\n",
"    y = df_input[[stratify_colname]]  # Dataframe of just the column on which to stratify.\n",
"\n",
"    # Split original dataframe into train and temp dataframes.\n",
"    df_train, df_temp, y_train, y_temp = train_test_split(\n",
"        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
"    )\n",
"\n",
"    if frac_val <= 0:\n",
"        # No validation part requested: the whole remainder becomes the test set.\n",
"        assert len(df_input) == len(df_train) + len(df_temp)\n",
"        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
"\n",
"    # Split the temp dataframe into val and test dataframes; the test share is\n",
"    # re-expressed relative to the rows left after the training part was removed.\n",
"    relative_frac_test = frac_test / (frac_val + frac_test)\n",
"\n",
"    df_val, df_test, y_val, y_test = train_test_split(\n",
"        df_temp,\n",
"        y_temp,\n",
"        stratify=y_temp,\n",
"        test_size=relative_frac_test,\n",
"        random_state=random_state,\n",
"    )\n",
"\n",
"    # Sanity check: the three parts together cover every input row.\n",
"    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"    return df_train, df_val, df_test, y_train, y_val, y_test\n",
"\n",
"X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
" df, stratify_colname=\"above_median_salary\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=42\n",
")\n",
"\n",
"display(\"X_train\", X_train)\n",
"display(\"y_train\", y_train)\n",
"\n",
"display(\"X_test\", X_test)\n",
"display(\"y_test\", y_test)\n",
"\n",
"# Проверка преобразования\n",
"print(df.dtypes)\n",
"\n",
"# Визуализация распределения зарплат\n",
"plt.figure(figsize=(10, 6))\n",
"sns.histplot(df['salary_in_usd'], bins=50, kde=True)\n",
"plt.title('Распределение зарплат')\n",
"plt.xlabel('Зарплата (USD)')\n",
"plt.ylabel('Частота')\n",
"plt.show()\n",
"\n",
"# Визуализация зависимости между зарплатой и уровнем опыта\n",
"plt.figure(figsize=(10, 6))\n",
"sns.boxplot(x='experience_level', y='salary_in_usd', data=df)\n",
"plt.title('Зависимость зарплаты от уровня опыта')\n",
"plt.xlabel('Уровень опыта')\n",
"plt.ylabel('Зарплата (USD)')\n",
"plt.show()"
]
},
2024-11-23 12:21:27 +04:00
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Теперь перейдем к делению на выборки и созданию ориентира"
]
},
2024-11-09 11:59:00 +04:00
{
"cell_type": "code",
2024-11-23 12:21:27 +04:00
"execution_count": 26,
2024-11-09 11:59:00 +04:00
"metadata": {},
"outputs": [
{
2024-11-23 12:21:27 +04:00
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: (3004, 10)\n",
"Размер тестовой выборки: (751, 10)\n",
"Baseline Accuracy: 0.5126498002663116\n",
"Baseline F1 Score: 0.3474826991241725\n"
2024-11-09 11:59:00 +04:00
]
}
],
"source": [
"import pandas as pd\n",
2024-11-23 12:17:48 +04:00
"from sklearn.model_selection import train_test_split\n",
2024-11-23 12:21:27 +04:00
"from sklearn.metrics import accuracy_score, f1_score\n",
2024-11-23 12:17:48 +04:00
"\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
"\n",
"# Создание целевого признака\n",
"median_salary = df['salary_in_usd'].median()\n",
"df['above_median_salary'] = np.where(df['salary_in_usd'] > median_salary, 1, 0)\n",
"\n",
"# Разделение на признаки и целевую переменную\n",
2024-11-23 12:21:27 +04:00
"features = ['work_year', 'experience_level', 'employment_type', 'job_title', 'salary', 'salary_currency', 'remote_ratio', 'employee_residence', 'company_location', 'company_size']\n",
"target = 'above_median_salary'\n",
2024-11-23 12:17:48 +04:00
"\n",
"# Разделение данных на тренировочный и тестовый наборы\n",
2024-11-23 12:21:27 +04:00
"X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42, stratify=df[target])\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"print(\"Размер обучающей выборки:\", X_train.shape)\n",
"print(\"Размер тестовой выборки:\", X_test.shape)\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Baseline: always predict the majority class of the training labels.\n",
"# y_train.mean() is the share of class 1, so class 1 is the majority only above 0.5.\n",
"baseline_threshold = y_train.mean()\n",
"majority_class = int(baseline_threshold > 0.5)\n",
"baseline_predictions = [majority_class] * len(y_test)\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Вычисление метрик для ориентира\n",
"baseline_accuracy = accuracy_score(y_test, baseline_predictions)\n",
"baseline_f1 = f1_score(y_test, baseline_predictions, average='weighted')\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"print('Baseline Accuracy:', baseline_accuracy)\n",
"print('Baseline F1 Score:', baseline_f1)"
2024-11-23 12:17:48 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-23 12:21:27 +04:00
"Создание конвейера и обучение моделей"
2024-11-23 12:17:48 +04:00
]
},
{
"cell_type": "code",
2024-11-23 12:21:27 +04:00
"execution_count": null,
2024-11-23 12:17:48 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-11-23 12:21:27 +04:00
"Model: Logistic Regression\n",
"Accuracy: 0.7523\n",
"F1 Score: 0.7609\n",
"----------------------------------------\n",
"Model: Decision Tree\n",
"Accuracy: 0.9960\n",
"F1 Score: 0.9959\n",
"----------------------------------------\n",
"Model: Gradient Boosting\n",
"Accuracy: 0.9947\n",
"F1 Score: 0.9945\n",
"----------------------------------------\n"
2024-11-23 12:17:48 +04:00
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
2024-11-23 12:21:27 +04:00
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from sklearn.metrics import accuracy_score, f1_score\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Загрузка данных\n",
2024-11-23 12:17:48 +04:00
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
"\n",
2024-11-23 12:21:27 +04:00
"# Создание целевого признака\n",
"median_salary = df['salary_in_usd'].median()\n",
"df['above_median_salary'] = np.where(df['salary_in_usd'] > median_salary, 1, 0)\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Разделение на признаки и целевую переменную\n",
"X = df.drop(columns=['salary_in_usd', 'above_median_salary'])\n",
"y = df['above_median_salary']\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Разделение данных на тренировочный и тестовый наборы\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Column definitions. The local-currency `salary` column is deliberately left out:\n",
"# the target is derived from salary_in_usd, so `salary` would leak the answer.\n",
"# Columns not listed here are dropped by ColumnTransformer (default remainder='drop').\n",
"numeric_columns = [\"work_year\", \"remote_ratio\"]\n",
"cat_columns = [\"experience_level\", \"employment_type\", \"job_title\", \"salary_currency\", \"employee_residence\", \"company_location\", \"company_size\"]\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Предобработка данных\n",
2024-11-23 12:17:48 +04:00
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
2024-11-23 12:21:27 +04:00
" ('num', StandardScaler(), numeric_columns),\n",
" ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns)])\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Создание конвейеров для моделей\n",
"pipeline_logistic_regression = Pipeline(steps=[\n",
2024-11-23 12:17:48 +04:00
" ('preprocessor', preprocessor),\n",
2024-11-23 12:21:27 +04:00
" ('classifier', LogisticRegression(random_state=42))])\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"pipeline_decision_tree = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('classifier', DecisionTreeClassifier(random_state=42))])\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"pipeline_gradient_boosting = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('classifier', GradientBoostingClassifier(random_state=42))])\n",
"\n",
"# Список конвейеров \n",
"pipelines = [\n",
" ('Logistic Regression', pipeline_logistic_regression),\n",
" ('Decision Tree', pipeline_decision_tree),\n",
" ('Gradient Boosting', pipeline_gradient_boosting)\n",
"]\n",
"\n",
"# Обучение моделей и вывод результатов\n",
"for name, pipeline in pipelines:\n",
" pipeline.fit(X_train, y_train)\n",
" y_pred = pipeline.predict(X_test)\n",
" accuracy = accuracy_score(y_test, y_pred)\n",
" f1 = f1_score(y_test, y_pred)\n",
" print(f\"Model: {name}\")\n",
" print(f\"Accuracy: {accuracy:.4f}\")\n",
" print(f\"F1 Score: {f1:.4f}\")\n",
" print(\"-\" * 40)"
2024-11-23 12:17:48 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-23 12:21:27 +04:00
"Оценка качества моделей"
2024-11-23 12:17:48 +04:00
]
},
{
"cell_type": "code",
2024-11-23 12:21:27 +04:00
"execution_count": 27,
2024-11-23 12:17:48 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-11-23 12:21:27 +04:00
"Model: Logistic Regression\n",
"Accuracy: 0.7523302263648469\n",
"F1 Score: 0.7517841210039291\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"Model: Decision Tree\n",
"Accuracy: 0.996005326231691\n",
"F1 Score: 0.9960048583691977\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"Model: Gradient Boosting\n",
"Accuracy: 0.9946737683089214\n",
"F1 Score: 0.9946728986768623\n",
"\n"
2024-11-23 12:17:48 +04:00
]
}
],
"source": [
2024-11-23 12:21:27 +04:00
"from sklearn.metrics import accuracy_score, f1_score\n",
"\n",
"for name, pipeline in pipelines:\n",
" y_pred = pipeline.predict(X_test)\n",
" print(f\"Model: {name}\")\n",
" print('Accuracy:', accuracy_score(y_test, y_pred))\n",
" print('F1 Score:', f1_score(y_test, y_pred, average='weighted'))\n",
" print()"
2024-11-23 12:17:48 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-23 12:21:27 +04:00
"## Регрессия\n",
"\n",
"**Цель:** разработать модель регрессии, которая будет предсказывать зарплату (`salary_in_usd`) на основе демографических данных, типа работы и других факторов."
2024-11-23 12:17:48 +04:00
]
},
{
"cell_type": "code",
2024-11-23 12:21:27 +04:00
"execution_count": 28,
2024-11-23 12:17:48 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-11-23 12:21:27 +04:00
"Размер данных до удаления выбросов: (3755, 11)\n",
"Размер данных после удаления выбросов: (3708, 11)\n"
2024-11-23 12:17:48 +04:00
]
}
],
"source": [
"import pandas as pd\n",
2024-11-23 12:21:27 +04:00
"from scipy import stats\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Загрузка данных\n",
2024-11-23 12:17:48 +04:00
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
"\n",
2024-11-23 12:21:27 +04:00
"# Определение числовых признаков\n",
"numeric_features = ['work_year', 'salary', 'salary_in_usd', 'remote_ratio']\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Вычисление z-оценок для числовых признаков\n",
"z_scores = stats.zscore(df[numeric_features])\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Определение порога для удаления выбросов\n",
"threshold = 3\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Удаление выбросов\n",
"df_cleaned = df[(z_scores < threshold).all(axis=1)]\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"print(\"Размер данных до удаления выбросов:\", df.shape)\n",
"print(\"Размер данных после удаления выбросов:\", df_cleaned.shape)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: (2966, 9)\n",
"Размер тестовой выборки: (742, 9)\n",
"Baseline MAE: 48988.97819674187\n",
"Baseline MSE: 3791583837.2779293\n",
"Baseline R²: -0.005051587587466155\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Определение признаков и целевой переменной\n",
"features = ['work_year', 'experience_level', 'employment_type', 'job_title', 'salary_currency', 'remote_ratio', 'employee_residence', 'company_location', 'company_size']\n",
"target = 'salary_in_usd'\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Разделение данных на тренировочный и тестовый наборы\n",
"X_train, X_test, y_train, y_test = train_test_split(df_cleaned[features], df_cleaned[target], test_size=0.2, random_state=42)\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"print(\"Размер обучающей выборки:\", X_train.shape)\n",
"print(\"Размер тестовой выборки:\", X_test.shape)\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Создание ориентира (baseline)\n",
"baseline_predictions = [y_train.mean()] * len(y_test)\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Вычисление метрик для ориентира\n",
"print('Baseline MAE:', mean_absolute_error(y_test, baseline_predictions))\n",
"print('Baseline MSE:', mean_squared_error(y_test, baseline_predictions))\n",
"print('Baseline R²:', r2_score(y_test, baseline_predictions))"
2024-11-23 12:17:48 +04:00
]
},
{
"cell_type": "code",
2024-11-23 12:21:27 +04:00
"execution_count": 31,
2024-11-23 12:17:48 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-11-23 12:21:27 +04:00
"Размер данных до удаления выбросов: (3755, 11)\n",
"Размер данных после удаления выбросов: (3733, 11)\n",
"Размер обучающей выборки: (2986, 9)\n",
"Размер тестовой выборки: (747, 9)\n",
"Baseline MAE: 47593.92288600708\n",
"Baseline MSE: 3680965527.9964128\n",
"Baseline R²: -0.0016576422593919116\n",
"Model: Linear Regression trained.\n",
"Model: Decision Tree trained.\n",
"Model: Gradient Boosting trained.\n",
"Model: Linear Regression\n",
"MAE: 36617.65439873256\n",
"MSE: 2194684192.4416404\n",
"R²: 0.4027865306031213\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"Model: Decision Tree\n",
"MAE: 36516.71804922624\n",
"MSE: 2246643776.062331\n",
"R²: 0.38864738324451775\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"Model: Gradient Boosting\n",
"MAE: 35842.80843437428\n",
"MSE: 2125285552.2470944\n",
"R²: 0.42167116230764956\n",
"\n"
2024-11-23 12:17:48 +04:00
]
}
],
"source": [
"import pandas as pd\n",
2024-11-23 12:21:27 +04:00
"from scipy import stats\n",
2024-11-23 12:17:48 +04:00
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
2024-11-23 12:21:27 +04:00
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import GradientBoostingRegressor\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Загрузка данных\n",
2024-11-23 12:17:48 +04:00
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
"\n",
2024-11-23 12:21:27 +04:00
"# Определение числовых признаков\n",
2024-11-23 12:17:48 +04:00
"numeric_features = ['work_year', 'salary_in_usd', 'remote_ratio']\n",
"\n",
2024-11-23 12:21:27 +04:00
"# Вычисление z-оценок для числовых признаков\n",
"z_scores = stats.zscore(df[numeric_features])\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Определение порога для удаления выбросов\n",
"threshold = 3\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Удаление выбросов\n",
"df_cleaned = df[(z_scores < threshold).all(axis=1)]\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"print(\"Размер данных до удаления выбросов:\", df.shape)\n",
"print(\"Размер данных после удаления выбросов:\", df_cleaned.shape)\n",
"\n",
"# Разделение на выборки и создание ориентира\n",
"features = ['work_year', 'experience_level', 'employment_type', 'job_title', 'salary_currency', 'remote_ratio', 'employee_residence', 'company_location', 'company_size']\n",
"target = 'salary_in_usd'\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"X_train, X_test, y_train, y_test = train_test_split(df_cleaned[features], df_cleaned[target], test_size=0.2, random_state=42)\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"print(\"Размер обучающей выборки:\", X_train.shape)\n",
"print(\"Размер тестовой выборки:\", X_test.shape)\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Создание ориентира (baseline)\n",
"baseline_predictions = [y_train.mean()] * len(y_test)\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"print('Baseline MAE:', mean_absolute_error(y_test, baseline_predictions))\n",
"print('Baseline MSE:', mean_squared_error(y_test, baseline_predictions))\n",
"print('Baseline R²:', r2_score(y_test, baseline_predictions))\n",
"\n",
"# Создание конвейера и обучение моделей\n",
"categorical_features = ['experience_level', 'employment_type', 'job_title', 'salary_currency', 'employee_residence', 'company_location', 'company_size']\n",
"numeric_features = ['work_year', 'remote_ratio']\n",
"\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" ('num', StandardScaler(), numeric_features),\n",
" ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])\n",
"\n",
"pipeline_linear_regression = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('regressor', LinearRegression())])\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"pipeline_decision_tree = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('regressor', DecisionTreeRegressor(random_state=42))])\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"pipeline_gradient_boosting = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('regressor', GradientBoostingRegressor(random_state=42))])\n",
"\n",
"pipelines = [\n",
" ('Linear Regression', pipeline_linear_regression),\n",
" ('Decision Tree', pipeline_decision_tree),\n",
" ('Gradient Boosting', pipeline_gradient_boosting)\n",
"]\n",
"\n",
"for name, pipeline in pipelines:\n",
" pipeline.fit(X_train, y_train)\n",
" print(f\"Model: {name} trained.\")\n",
"\n",
"# Оценка качества моделей\n",
"for name, pipeline in pipelines:\n",
" y_pred = pipeline.predict(X_test)\n",
" print(f\"Model: {name}\")\n",
" print('MAE:', mean_absolute_error(y_test, y_pred))\n",
" print('MSE:', mean_squared_error(y_test, y_pred))\n",
" print('R²:', r2_score(y_test, y_pred))\n",
" print()"
2024-11-23 12:17:48 +04:00
]
},
{
2024-11-23 12:21:27 +04:00
"cell_type": "code",
"execution_count": 32,
2024-11-23 12:17:48 +04:00
"metadata": {},
2024-11-23 12:21:27 +04:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: Linear Regression\n",
"MAE: 36617.65439873256\n",
"MSE: 2194684192.4416404\n",
"R²: 0.4027865306031213\n",
"\n",
"Model: Decision Tree\n",
"MAE: 36516.71804922624\n",
"MSE: 2246643776.062331\n",
"R²: 0.38864738324451775\n",
"\n",
"Model: Gradient Boosting\n",
"MAE: 35842.80843437428\n",
"MSE: 2125285552.2470944\n",
"R²: 0.42167116230764956\n",
"\n"
]
}
],
2024-11-23 12:17:48 +04:00
"source": [
2024-11-23 12:21:27 +04:00
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"for name, pipeline in pipelines:\n",
" y_pred = pipeline.predict(X_test)\n",
" print(f\"Model: {name}\")\n",
" print('MAE:', mean_absolute_error(y_test, y_pred))\n",
" print('MSE:', mean_squared_error(y_test, y_pred))\n",
" print('R²:', r2_score(y_test, y_pred))\n",
" print()"
2024-11-23 12:17:48 +04:00
]
},
{
"cell_type": "code",
2024-11-23 12:21:27 +04:00
"execution_count": 43,
2024-11-23 12:17:48 +04:00
"metadata": {},
2024-11-23 12:21:27 +04:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Пропущенные значения:\n",
" work_year 0\n",
"experience_level 0\n",
"employment_type 0\n",
"job_title 0\n",
"salary 0\n",
"salary_currency 0\n",
"salary_in_usd 0\n",
"employee_residence 0\n",
"remote_ratio 0\n",
"company_location 0\n",
"company_size 0\n",
"dtype: int64\n"
]
}
],
2024-11-23 12:17:48 +04:00
"source": [
"import pandas as pd\n",
2024-11-23 12:21:27 +04:00
"from sklearn.model_selection import train_test_split\n",
2024-11-23 12:17:48 +04:00
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
2024-11-23 12:21:27 +04:00
"from sklearn.compose import ColumnTransformer\n",
2024-11-23 12:17:48 +04:00
"from sklearn.pipeline import Pipeline\n",
2024-11-23 12:21:27 +04:00
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"from scipy.stats import uniform, randint\n",
"from sklearn.model_selection import RandomizedSearchCV\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Проверка на пропущенные значения\n",
"print(\"Пропущенные значения:\\n\", df.isnull().sum())\n",
"\n",
"# Удаление строк с пропущенными значениями\n",
"df = df.dropna()\n",
"\n",
"# Выбор признаков и целевой переменной\n",
"features = ['work_year', 'experience_level', 'employment_type', 'job_title', 'employee_residence', 'remote_ratio', 'company_location', 'company_size']\n",
"target = 'salary_in_usd'\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Определение категориальных и числовых признаков\n",
"categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']\n",
"numeric_features = ['work_year', 'remote_ratio']\n",
"\n",
"# Создание пайплайна для обработки данных\n",
"categorical_transformer = Pipeline(steps=[\n",
2024-11-23 12:17:48 +04:00
" ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
"])\n",
"\n",
2024-11-23 12:21:27 +04:00
"numeric_transformer = Pipeline(steps=[\n",
" ('scaler', StandardScaler())\n",
"])\n",
"\n",
"preprocessor = ColumnTransformer(\n",
2024-11-23 12:17:48 +04:00
" transformers=[\n",
2024-11-23 12:21:27 +04:00
" ('num', numeric_transformer, numeric_features),\n",
" ('cat', categorical_transformer, categorical_features)\n",
" ])\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Split first, then fit the preprocessor on the training rows only, so that the\n",
"# scaling statistics and one-hot categories are not learned from the test set.\n",
"y = df[target]\n",
"X_train_raw, X_test_raw, y_train, y_test = train_test_split(df[features], y, test_size=0.2, random_state=42)\n",
"X_train = preprocessor.fit_transform(X_train_raw)\n",
"X_test = preprocessor.transform(X_test_raw)\n",
"\n",
"# Full feature matrix encoded with the train-fitted preprocessor (kept under the original name X).\n",
"X = preprocessor.transform(df[features])"
2024-11-23 12:17:48 +04:00
]
},
{
"cell_type": "code",
2024-11-23 12:21:27 +04:00
"execution_count": 47,
2024-11-23 12:17:48 +04:00
"metadata": {},
"outputs": [
{
2024-11-23 12:21:27 +04:00
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 4 is smaller than n_iter=10. Running 4 iterations. For exhaustive searches, use GridSearchCV.\n",
" warnings.warn(\n",
"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:540: FitFailedWarning: \n",
"6 fits failed out of a total of 12.\n",
"The score on these train-test partitions for these parameters will be set to nan.\n",
"If these failures are not expected, you can try to debug them by setting error_score='raise'.\n",
"\n",
"Below are more details about the failures:\n",
"--------------------------------------------------------------------------------\n",
"6 fits failed with the following error:\n",
"Traceback (most recent call last):\n",
" File \"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n",
" estimator.fit(X_train, y_train, **fit_params)\n",
" File \"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\base.py\", line 1473, in wrapper\n",
" return fit_method(estimator, *args, **kwargs)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\pipeline.py\", line 473, in fit\n",
" self._final_estimator.fit(Xt, y, **last_step_params[\"fit\"])\n",
" File \"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\base.py\", line 1473, in wrapper\n",
" return fit_method(estimator, *args, **kwargs)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\linear_model\\_base.py\", line 609, in fit\n",
" X, y = self._validate_data(\n",
" ^^^^^^^^^^^^^^^^^^^^\n",
" File \"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\base.py\", line 650, in _validate_data\n",
" X, y = check_X_y(X, y, **check_params)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\utils\\validation.py\", line 1301, in check_X_y\n",
" X = check_array(\n",
" ^^^^^^^^^^^^\n",
" File \"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\utils\\validation.py\", line 971, in check_array\n",
" array = _ensure_sparse_format(\n",
" ^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\utils\\validation.py\", line 595, in _ensure_sparse_format\n",
" raise TypeError(\n",
"TypeError: Sparse data was passed for X, but dense data is required. Use '.toarray()' to convert to a dense numpy array.\n",
"\n",
" warnings.warn(some_fits_failed_message, FitFailedWarning)\n",
"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:1103: UserWarning: One or more of the test scores are non-finite: [ nan 0.37308723 nan 0.37316524]\n",
" warnings.warn(\n",
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_14908\\2948510432.py:70: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.\n",
" axes[i].set_xticklabels(params.keys(), rotation=45, ha=\"right\") #Поворачиваем подписи на оси х\n",
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_14908\\2948510432.py:70: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.\n",
" axes[i].set_xticklabels(params.keys(), rotation=45, ha=\"right\") #Поворачиваем подписи на оси х\n",
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_14908\\2948510432.py:70: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.\n",
" axes[i].set_xticklabels(params.keys(), rotation=45, ha=\"right\") #Поворачиваем подписи на оси х\n"
2024-11-23 12:17:48 +04:00
]
2024-11-23 12:21:27 +04:00
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA90AAAXRCAYAAACD3P7HAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXhMZ//H8c8kkUVIYskixK61VSm1pLWUtEFQtS+1F21pKY9dCYrS9qmqoh5qrbW0RVF7tcRetW8VS2liTYKQRHJ+f/hlamSRqGO0eb+ua652ztznzPecnMh85tz3uS2GYRgCAAAAAACPnIO9CwAAAAAA4N+K0A0AAAAAgEkI3QAAAAAAmITQDQAAAACASQjdAAAAAACYhNANAAAAAIBJCN0AAAAAAJiE0A0AAAAAgEkI3QAAAAAAmITQDQAAkEGFCxdWx44d7V0G/oZZs2bJYrHo9OnT9i4FQBZB6AaQpSR/2ErtUbZsWXuXB8BOTp8+LYvFoo8//tjepZhm8+bNNv/mOTo6ysfHR82aNdORI0fsXR4A/Gs52bsAALCHkSNHqkiRItbno0ePtmM1AP4pjh07JgeHf/Y1i3fffVfPP/+8EhIStH//fk2dOlWbN2/WwYMH5efnZ+/yTNeuXTu1atVKLi4u9i4FQBZB6AaQpRiGIUmqX7++KlasaF0+ffp0Xb582V5lAfiHeNKD2s2bN+Xu7p5um+rVq6tZs2bW508//bTeeustzZkzR/379ze7RBuxsbHKnj37Y31PR0dHOTo6Ptb3BJC1/bO/qgWATEpISJCkB37gqlmzpp599tlUX3v66acVHBwsKe2xgbVq1VKtWrWsz5O7dX7zzTc27XLkyJFifOjrr7+uvHnzWp8nd3udNWuWTbsePXrIYrGkWD8qKkq9e/dWQECAXFxcVLx4cY0bN05JSUnp7rN0d7xqal3v33jjjUzXkl5XfovFotDQUJttnD9/Xp07d5avr69cXFxUpkwZffXVVzZtko/jokWLNHjwYPn5+cnd3V2NGjXSuXPnbNrWqlUr3fe/92f2/fffKyQkRP7+/nJxcVGxYsU0atQoJSYmpthm2bJltWfPHgUGBsrNzU1FihTR1KlTbdrFx8dr2LBhqlixojw9PeXu7q7q1atr06ZNNu2Sj6fFYtF3331n89rt27eVK1euVLs8P+hY3d+NOL3jHxoaKovFoqNHj6pFixby8PBQnjx51KtXL92+fdvmfS0Wi3r27Km0ZHasbFrn2/3n17113v+4//zftGmTqlevbj12yY/06s6M+8d0J+/z1q1b1adPH3l7e8vd3V2vvfaaLl26lGL91atXq3r16nJ3d1fOnDkVEhKiQ4cO2bTZv3+/OnbsqKJFi8rV1VV+fn7q3Lmzrly5kuoxOXz4sNq0aaNcuXLpxRdfzPQ+Va9eXZL0+++/2yzPyO+kJJ05c0aNGjWSu7u7fHx89N577+nHH3+UxWLR5s2bre3u/f2pUaOGsmfPrsGDB0uS4uLiNHz4cBUvXlwuLi4KCAhQ//79FRcXZ/Ne69at04svvigvLy/lyJFDTz/9tHUbyT7//HOVKVNG2bNnV65cuVSpUiXNnz/f+npa5+nkyZNVpkwZubi4yN/fXz169FBUVJRNm+R9OHz4sF566SVlz55d+fPn1/jx4zN0rAFkTVzpBpClxMfHS5KcnZ3TbdeuXTt17dpVBw8etBnrvWvXLh0/flxDhw41tc4HOXnypP73v/+lWB4bG6uaNWvq/Pnz6t69uwoWLKht27Zp0KBB+vPPPzVhwoQHbrt8+fLq27evzbLixYtnupZk93flv3Hjht566y2bNpGRkapatao1HHl7e2v16tXq0qWLYmJi1Lt3b5v2o0ePlsVi0YABA3Tx4kVNmDBBQUFB2rdvn9zc3KztChQooLFjx9qsu2rVKi1YsMBm2axZs5QjRw716dNHOXLk0MaNGzVs2DDFxMToo48+sm
l77do11a9fXy1atFDr1q21ePFivfXWW3J2dlbnzp0lSTExMZo+fbpat26trl276vr165oxY4aCg4O1c+dOlS9f3mabrq6umjlzpho3bmxdtmzZshShN6PHqlSpUpo7d651nWnTpunIkSP69NNPrcvKlStns90WLVqocOHCGjt2rLZv366JEyfq2rVrmjNnTooaHqV7z7fw8HANGzYs3fb37td7771n81p4eLhCQkKUL18+DRs2TN7e3pLu/j6b7Z133lGuXLk0fPhwnT59WhMmTFDPnj21aNEim9o7dOig4OBgjRs3TrGxsZoyZYpefPFF/frrrypcuLCku8Hy1KlT6tSpk/z8/HTo0CFNmzZNhw4d0vbt22WxWGzeu3nz5ipRooTGjBlj7c2TGcnhM1euXNZlGf2dvHnzpmrXrq0///xTvXr1kp+fn+bPn5/iC6ZkV65cUb169dSqVSu9/vrr8vX1VVJSkho1aqRffvlF3bp1U6lSpXTgwAF9+umnOn78uPULqUOHDqlBgwYqV66cRo4cKRcXF508eVJbt261bv9///uf3n33XTVr1sz6xdH+/fu1Y8cOtWnTJs1jEBoaqhEjRigoKEhvvfWWjh07pilTpmjXrl3aunWrsmXLZm177do11a1bV02aNFGLFi30zTffaMCAAXrmmWdUr169TB9/AFmAAQBZyAcffGBIMk6dOmWzvGbNmkaZMmWsz6OiogxXV1djwIABNu3effddw93d3bhx44ZhGIYxe/bsNLdXs2ZN6/NNmzYZkowlS5bYtHN3dzc6dOhgs6xt27ZGnjx5rM/Dw8MNScbMmTOty1q0aGGULVvWCAgIsFl/1KhRhru7u3H8+HGbbQ4cONBwdHQ0zp49m/qB+X+FChUyQkJC0nw9M7XMnDnTkGTs2rXLZhuXLl0yJBnDhw+3LuvSpYuRL18+4/LlyzZtW7VqZXh6ehqxsbGGYfx1HPPnz2/ExMRY2y1evNiQZHz22WfWZff/TJN99NFHhiQjPDzcuix5+/fq3r27kT17duP27ds225RkfPLJJ9ZlcXFxRvny5Q0fHx8jPj7eMAzDuHPnjhEXF2ezvWvXrhm+vr5G586drcuSj2fr1q0NJycnIyIiwvpanTp1jDZt2hiSjI8++ijTx+peHTp0MAoVKpRiuWEYxvDhww1JRqNGjWyWv/3224Yk47fffrMuk2T06NEj1e0Yxl8/83uPbXr8/f2NBg0aWJ/v2rUrxfmVbMiQIYbFYrFZVqhQIZtz7ssvvzQkGWFhYTbtHlS3Yfz1s7j3WKfm/vdM3uegoCAjKSnJuvy9994zHB0djaioKMMwDOP69euGl5eX0bVrV5vtRUREGJ6enjbLU/sZLliwwJBkbNmyxbos+WfXunXrdGtOlvz789VXXxmXLl0yLly4YKxZs8YoXry4YbFYjJ07d1rbZvQ8++STTwxJxnfffWdtc+vWLaNkyZKGJGPTpk3W5cm/P1OnTrXZ5ty5cw0HBwfj559/tlk+depUQ5KxdetWwzAM49NPPzUkGZcuXUpzH1999dVUf+/vdf95evHiRcPZ2dl45ZVXjMTERGu7SZMmWY/X/fswZ84c67K4uDjDz8/PaNq0abrvCyDrons5gCzl2rVrkqQ8efKk287T01OvvvqqFixYYL1ylJiYqEWLFqlx48bWMZM+Pj6SpD/++MPEqm3t2bNHS5Ys0dixY1Pc0GnJkiXWrrWXL1+2PoKCgpSYmKgtW7Y8tloyyjAMLV26VA0bNpRhGDZ1BwcHKzo6Wnv37rVZp3379sqZM6f1ebNmzZQvXz6tWrXqoWq49+r49evXdfnyZVWvXl2xsbE6evSoTVsnJyd1797d+tzZ2Vndu3fXxYsXtWfPHkl3hy8k96ZISkrS1atXdefOHVWqVCnFvkjSc889pzJlyliv4p45c0abNm1K0XX6YY5VRvXo0cPm+TvvvCNJKY7p7du3dfnyZV25ciVDQxbSc/v2bb
m6umaobXx8/APHU1+/fl3Sg3+/zdCtWzebK9DVq1dXYmKizpw5I+nu1euoqCi1bt3a5ufm6OioKlWq2FwZvvd8TD7
"text/plain": [
"<Figure size 1000x1500 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
2024-11-23 12:17:48 +04:00
}
],
"source": [
2024-11-23 12:21:27 +04:00
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split, RandomizedSearchCV\n",
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
"from scipy.stats import uniform, randint\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
2024-11-23 12:17:48 +04:00
"\n",
2024-11-23 12:21:27 +04:00
"# ... (ваш код предобработки данных, как в предыдущем примере) ...\n",
"\n",
"# Определение распределений для гиперпараметров\n",
"param_distributions = {\n",
" 'Linear Regression': {\n",
" 'regressor__fit_intercept': [True, False],\n",
" 'regressor__positive': [True, False]\n",
" },\n",
" 'Random Forest': {\n",
" 'regressor__n_estimators': randint(50, 200),\n",
" 'regressor__max_depth': [None, 10, 20],\n",
" 'regressor__min_samples_split': randint(2, 11),\n",
" 'regressor__min_samples_leaf': randint(1, 5),\n",
" 'regressor__bootstrap': [True, False]\n",
" },\n",
" 'Gradient Boosting': {\n",
" 'regressor__n_estimators': randint(50, 200),\n",
" 'regressor__learning_rate': uniform(0.01, 0.49), # uniform distribution for learning rate\n",
" 'regressor__max_depth': [3, 5, 7],\n",
" 'regressor__min_samples_split': randint(2, 11),\n",
" 'regressor__min_samples_leaf': randint(1, 5),\n",
" 'regressor__subsample': uniform(0.5, 0.5) # uniform distribution for subsample\n",
"\n",
" }\n",
"}\n",
"\n",
"# Словарь для хранения лучших моделей и их гиперпараметров\n",
"best_models = {}\n",
"\n",
"# Цикл для обучения и настройки гиперпараметров каждой модели\n",
"for model_name, model_params in param_distributions.items():\n",
" if model_name == 'Linear Regression':\n",
" model = LinearRegression()\n",
" elif model_name == 'Random Forest':\n",
" model = RandomForestRegressor(random_state=42)\n",
" elif model_name == 'Gradient Boosting':\n",
" model = GradientBoostingRegressor(random_state=42)\n",
" else:\n",
" continue #Обработка неизвестных моделей\n",
"\n",
" pipeline = Pipeline([('regressor', model)])\n",
" random_search = RandomizedSearchCV(pipeline, param_distributions=model_params, n_iter=10, cv=3, n_jobs=-1, random_state=42)\n",
" random_search.fit(X_train, y_train)\n",
" best_models[model_name] = random_search.best_params_\n",
"\n",
"\n",
"# Визуализация лучших гиперпараметров\n",
"\n",
"fig, axes = plt.subplots(len(best_models), 1, figsize=(10, 5 * len(best_models)))\n",
"if len(best_models) == 1:\n",
" axes = [axes] # обработка случая с одной моделью\n",
"\n",
"for i, (model_name, params) in enumerate(best_models.items()):\n",
" axes[i].bar(params.keys(), params.values())\n",
" axes[i].set_title(f\"Лучшие гиперпараметры для {model_name}\")\n",
" axes[i].set_xticklabels(params.keys(), rotation=45, ha=\"right\") #Поворачиваем подписи на оси х\n",
" axes[i].tick_params(axis='x', which='major', labelsize=8) # Размер шрифта подписей оси х\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n"
2024-11-09 11:59:00 +04:00
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aimenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}