2024-11-09 11:59:00 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Начало лабораторной работы**"
]
},
2024-11-23 12:17:48 +04:00
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['work_year', 'experience_level', 'employment_type', 'job_title',\n",
" 'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',\n",
" 'remote_ratio', 'company_location', 'company_size'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>work_year</th>\n",
" <th>experience_level</th>\n",
" <th>employment_type</th>\n",
" <th>job_title</th>\n",
" <th>salary</th>\n",
" <th>salary_currency</th>\n",
" <th>salary_in_usd</th>\n",
" <th>employee_residence</th>\n",
" <th>remote_ratio</th>\n",
" <th>company_location</th>\n",
" <th>company_size</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Principal Data Scientist</td>\n",
" <td>80000</td>\n",
" <td>EUR</td>\n",
" <td>85847</td>\n",
" <td>ES</td>\n",
" <td>100</td>\n",
" <td>ES</td>\n",
" <td>L</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2023</td>\n",
" <td>MI</td>\n",
" <td>CT</td>\n",
" <td>ML Engineer</td>\n",
" <td>30000</td>\n",
" <td>USD</td>\n",
" <td>30000</td>\n",
" <td>US</td>\n",
" <td>100</td>\n",
" <td>US</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2023</td>\n",
" <td>MI</td>\n",
" <td>CT</td>\n",
" <td>ML Engineer</td>\n",
" <td>25500</td>\n",
" <td>USD</td>\n",
" <td>25500</td>\n",
" <td>US</td>\n",
" <td>100</td>\n",
" <td>US</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Scientist</td>\n",
" <td>175000</td>\n",
" <td>USD</td>\n",
" <td>175000</td>\n",
" <td>CA</td>\n",
" <td>100</td>\n",
" <td>CA</td>\n",
" <td>M</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Scientist</td>\n",
" <td>120000</td>\n",
" <td>USD</td>\n",
" <td>120000</td>\n",
" <td>CA</td>\n",
" <td>100</td>\n",
" <td>CA</td>\n",
" <td>M</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" work_year experience_level employment_type job_title \\\n",
"0 2023 SE FT Principal Data Scientist \n",
"1 2023 MI CT ML Engineer \n",
"2 2023 MI CT ML Engineer \n",
"3 2023 SE FT Data Scientist \n",
"4 2023 SE FT Data Scientist \n",
"\n",
" salary salary_currency salary_in_usd employee_residence remote_ratio \\\n",
"0 80000 EUR 85847 ES 100 \n",
"1 30000 USD 30000 US 100 \n",
"2 25500 USD 25500 US 100 \n",
"3 175000 USD 175000 CA 100 \n",
"4 120000 USD 120000 CA 100 \n",
"\n",
" company_location company_size \n",
"0 ES L \n",
"1 US S \n",
"2 US S \n",
"3 CA M \n",
"4 CA M "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>work_year</th>\n",
" <th>salary</th>\n",
" <th>salary_in_usd</th>\n",
" <th>remote_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>3755.000000</td>\n",
" <td>3.755000e+03</td>\n",
" <td>3755.000000</td>\n",
" <td>3755.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>2022.373635</td>\n",
" <td>1.906956e+05</td>\n",
" <td>137570.389880</td>\n",
" <td>46.271638</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.691448</td>\n",
" <td>6.716765e+05</td>\n",
" <td>63055.625278</td>\n",
" <td>48.589050</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>2020.000000</td>\n",
" <td>6.000000e+03</td>\n",
" <td>5132.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>2022.000000</td>\n",
" <td>1.000000e+05</td>\n",
" <td>95000.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>2022.000000</td>\n",
" <td>1.380000e+05</td>\n",
" <td>135000.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>2023.000000</td>\n",
" <td>1.800000e+05</td>\n",
" <td>175000.000000</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>2023.000000</td>\n",
" <td>3.040000e+07</td>\n",
" <td>450000.000000</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" work_year salary salary_in_usd remote_ratio\n",
"count 3755.000000 3.755000e+03 3755.000000 3755.000000\n",
"mean 2022.373635 1.906956e+05 137570.389880 46.271638\n",
"std 0.691448 6.716765e+05 63055.625278 48.589050\n",
"min 2020.000000 6.000000e+03 5132.000000 0.000000\n",
"25% 2022.000000 1.000000e+05 95000.000000 0.000000\n",
"50% 2022.000000 1.380000e+05 135000.000000 0.000000\n",
"75% 2023.000000 1.800000e+05 175000.000000 100.000000\n",
"max 2023.000000 3.040000e+07 450000.000000 100.000000"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"work_year 0\n",
"experience_level 0\n",
"employment_type 0\n",
"job_title 0\n",
"salary 0\n",
"salary_currency 0\n",
"salary_in_usd 0\n",
"employee_residence 0\n",
"remote_ratio 0\n",
"company_location 0\n",
"company_size 0\n",
"dtype: int64\n",
"work_year False\n",
"experience_level False\n",
"employment_type False\n",
"job_title False\n",
"salary False\n",
"salary_currency False\n",
"salary_in_usd False\n",
"employee_residence False\n",
"remote_ratio False\n",
"company_location False\n",
"company_size False\n",
"dtype: bool\n"
]
}
],
"source": [
"# Percentage of missing values per feature; only features that have any are reported.\n",
"missing_counts = df.isnull().sum()\n",
"total_rows = len(df)\n",
"for column in df.columns:\n",
"    null_rate = missing_counts[column] / total_rows * 100\n",
"    if null_rate > 0:\n",
"        print(f'{column} Процент пустых значений: %{null_rate:.2f}')\n",
"\n",
"# Absolute number of missing values in every column.\n",
"print(missing_counts)\n",
"\n",
"# Whether each column contains at least one missing value.\n",
"print(df.isnull().any())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'X_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>work_year</th>\n",
" <th>experience_level</th>\n",
" <th>employment_type</th>\n",
" <th>job_title</th>\n",
" <th>salary</th>\n",
" <th>salary_currency</th>\n",
" <th>salary_in_usd</th>\n",
" <th>employee_residence</th>\n",
" <th>remote_ratio</th>\n",
" <th>company_location</th>\n",
" <th>company_size</th>\n",
" <th>above_median_salary</th>\n",
" <th>salary_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1809</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Engineer</td>\n",
" <td>182000</td>\n",
" <td>USD</td>\n",
" <td>182000</td>\n",
" <td>US</td>\n",
" <td>100</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1082</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Machine Learning Engineer</td>\n",
" <td>126000</td>\n",
" <td>USD</td>\n",
" <td>126000</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1686</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>BI Developer</td>\n",
" <td>140000</td>\n",
" <td>USD</td>\n",
" <td>140000</td>\n",
" <td>US</td>\n",
" <td>100</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1600</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Scientist</td>\n",
" <td>140000</td>\n",
" <td>USD</td>\n",
" <td>140000</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1376</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Engineer</td>\n",
" <td>226700</td>\n",
" <td>USD</td>\n",
" <td>226700</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2706</th>\n",
" <td>2022</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Engineer</td>\n",
" <td>160000</td>\n",
" <td>USD</td>\n",
" <td>160000</td>\n",
" <td>US</td>\n",
" <td>100</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>928</th>\n",
" <td>2023</td>\n",
" <td>MI</td>\n",
" <td>FT</td>\n",
" <td>Data Engineer</td>\n",
" <td>200000</td>\n",
" <td>USD</td>\n",
" <td>200000</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>564</th>\n",
" <td>2023</td>\n",
" <td>MI</td>\n",
" <td>FT</td>\n",
" <td>Data Engineer</td>\n",
" <td>140000</td>\n",
" <td>USD</td>\n",
" <td>140000</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>716</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Scientist</td>\n",
" <td>297300</td>\n",
" <td>USD</td>\n",
" <td>297300</td>\n",
" <td>US</td>\n",
" <td>100</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1299</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Engineer</td>\n",
" <td>133832</td>\n",
" <td>USD</td>\n",
" <td>133832</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3004 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" work_year experience_level employment_type job_title \\\n",
"1809 2023 SE FT Data Engineer \n",
"1082 2023 SE FT Machine Learning Engineer \n",
"1686 2023 SE FT BI Developer \n",
"1600 2023 SE FT Data Scientist \n",
"1376 2023 SE FT Data Engineer \n",
"... ... ... ... ... \n",
"2706 2022 SE FT Data Engineer \n",
"928 2023 MI FT Data Engineer \n",
"564 2023 MI FT Data Engineer \n",
"716 2023 SE FT Data Scientist \n",
"1299 2023 SE FT Data Engineer \n",
"\n",
" salary salary_currency salary_in_usd employee_residence remote_ratio \\\n",
"1809 182000 USD 182000 US 100 \n",
"1082 126000 USD 126000 US 0 \n",
"1686 140000 USD 140000 US 100 \n",
"1600 140000 USD 140000 US 0 \n",
"1376 226700 USD 226700 US 0 \n",
"... ... ... ... ... ... \n",
"2706 160000 USD 160000 US 100 \n",
"928 200000 USD 200000 US 0 \n",
"564 140000 USD 140000 US 0 \n",
"716 297300 USD 297300 US 100 \n",
"1299 133832 USD 133832 US 0 \n",
"\n",
" company_location company_size above_median_salary salary_category \n",
"1809 US M 1 1 \n",
"1082 US M 0 1 \n",
"1686 US M 1 1 \n",
"1600 US M 1 1 \n",
"1376 US M 1 2 \n",
"... ... ... ... ... \n",
"2706 US M 1 1 \n",
"928 US M 1 1 \n",
"564 US M 1 1 \n",
"716 US M 1 2 \n",
"1299 US M 0 1 \n",
"\n",
"[3004 rows x 13 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>above_median_salary</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1809</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1082</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1686</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1600</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1376</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2706</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>928</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>564</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>716</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1299</th>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3004 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" above_median_salary\n",
"1809 1\n",
"1082 0\n",
"1686 1\n",
"1600 1\n",
"1376 1\n",
"... ...\n",
"2706 1\n",
"928 1\n",
"564 1\n",
"716 1\n",
"1299 0\n",
"\n",
"[3004 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'X_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>work_year</th>\n",
" <th>experience_level</th>\n",
" <th>employment_type</th>\n",
" <th>job_title</th>\n",
" <th>salary</th>\n",
" <th>salary_currency</th>\n",
" <th>salary_in_usd</th>\n",
" <th>employee_residence</th>\n",
" <th>remote_ratio</th>\n",
" <th>company_location</th>\n",
" <th>company_size</th>\n",
" <th>above_median_salary</th>\n",
" <th>salary_category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3459</th>\n",
" <td>2022</td>\n",
" <td>MI</td>\n",
" <td>FT</td>\n",
" <td>Research Scientist</td>\n",
" <td>59000</td>\n",
" <td>EUR</td>\n",
" <td>61989</td>\n",
" <td>AT</td>\n",
" <td>0</td>\n",
" <td>AT</td>\n",
" <td>L</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3724</th>\n",
" <td>2021</td>\n",
" <td>EN</td>\n",
" <td>FT</td>\n",
" <td>Business Data Analyst</td>\n",
" <td>50000</td>\n",
" <td>EUR</td>\n",
" <td>59102</td>\n",
" <td>LU</td>\n",
" <td>100</td>\n",
" <td>LU</td>\n",
" <td>L</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1795</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Engineer</td>\n",
" <td>180000</td>\n",
" <td>USD</td>\n",
" <td>180000</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3535</th>\n",
" <td>2021</td>\n",
" <td>MI</td>\n",
" <td>FT</td>\n",
" <td>Data Scientist</td>\n",
" <td>50000</td>\n",
" <td>USD</td>\n",
" <td>50000</td>\n",
" <td>NG</td>\n",
" <td>100</td>\n",
" <td>NG</td>\n",
" <td>L</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3255</th>\n",
" <td>2022</td>\n",
" <td>MI</td>\n",
" <td>FT</td>\n",
" <td>Data Analyst</td>\n",
" <td>106260</td>\n",
" <td>USD</td>\n",
" <td>106260</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1943</th>\n",
" <td>2022</td>\n",
" <td>MI</td>\n",
" <td>FT</td>\n",
" <td>Data Engineer</td>\n",
" <td>120000</td>\n",
" <td>USD</td>\n",
" <td>120000</td>\n",
" <td>US</td>\n",
" <td>100</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>573</th>\n",
" <td>2023</td>\n",
" <td>EN</td>\n",
" <td>FT</td>\n",
" <td>Autonomous Vehicle Technician</td>\n",
" <td>7000</td>\n",
" <td>USD</td>\n",
" <td>7000</td>\n",
" <td>GH</td>\n",
" <td>0</td>\n",
" <td>GH</td>\n",
" <td>S</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3013</th>\n",
" <td>2022</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Machine Learning Engineer</td>\n",
" <td>129300</td>\n",
" <td>USD</td>\n",
" <td>129300</td>\n",
" <td>US</td>\n",
" <td>0</td>\n",
" <td>US</td>\n",
" <td>M</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>327</th>\n",
" <td>2023</td>\n",
" <td>EN</td>\n",
" <td>FT</td>\n",
" <td>Data Scientist</td>\n",
" <td>70000</td>\n",
" <td>CAD</td>\n",
" <td>51753</td>\n",
" <td>CA</td>\n",
" <td>100</td>\n",
" <td>CA</td>\n",
" <td>L</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1565</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Analyst</td>\n",
" <td>48000</td>\n",
" <td>EUR</td>\n",
" <td>51508</td>\n",
" <td>ES</td>\n",
" <td>0</td>\n",
" <td>ES</td>\n",
" <td>M</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>751 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" work_year experience_level employment_type \\\n",
"3459 2022 MI FT \n",
"3724 2021 EN FT \n",
"1795 2023 SE FT \n",
"3535 2021 MI FT \n",
"3255 2022 MI FT \n",
"... ... ... ... \n",
"1943 2022 MI FT \n",
"573 2023 EN FT \n",
"3013 2022 SE FT \n",
"327 2023 EN FT \n",
"1565 2023 SE FT \n",
"\n",
" job_title salary salary_currency salary_in_usd \\\n",
"3459 Research Scientist 59000 EUR 61989 \n",
"3724 Business Data Analyst 50000 EUR 59102 \n",
"1795 Data Engineer 180000 USD 180000 \n",
"3535 Data Scientist 50000 USD 50000 \n",
"3255 Data Analyst 106260 USD 106260 \n",
"... ... ... ... ... \n",
"1943 Data Engineer 120000 USD 120000 \n",
"573 Autonomous Vehicle Technician 7000 USD 7000 \n",
"3013 Machine Learning Engineer 129300 USD 129300 \n",
"327 Data Scientist 70000 CAD 51753 \n",
"1565 Data Analyst 48000 EUR 51508 \n",
"\n",
" employee_residence remote_ratio company_location company_size \\\n",
"3459 AT 0 AT L \n",
"3724 LU 100 LU L \n",
"1795 US 0 US M \n",
"3535 NG 100 NG L \n",
"3255 US 0 US M \n",
"... ... ... ... ... \n",
"1943 US 100 US M \n",
"573 GH 0 GH S \n",
"3013 US 0 US M \n",
"327 CA 100 CA L \n",
"1565 ES 0 ES M \n",
"\n",
" above_median_salary salary_category \n",
"3459 0 0 \n",
"3724 0 0 \n",
"1795 1 1 \n",
"3535 0 0 \n",
"3255 0 1 \n",
"... ... ... \n",
"1943 0 1 \n",
"573 0 0 \n",
"3013 0 1 \n",
"327 0 0 \n",
"1565 0 0 \n",
"\n",
"[751 rows x 13 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>above_median_salary</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3459</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3724</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1795</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3535</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3255</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1943</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>573</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3013</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>327</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1565</th>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>751 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" above_median_salary\n",
"3459 0\n",
"3724 0\n",
"1795 1\n",
"3535 0\n",
"3255 0\n",
"... ...\n",
"1943 0\n",
"573 0\n",
"3013 0\n",
"327 0\n",
"1565 0\n",
"\n",
"[751 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"work_year int64\n",
"experience_level object\n",
"employment_type object\n",
"job_title object\n",
"salary int64\n",
"salary_currency object\n",
"salary_in_usd int64\n",
"employee_residence object\n",
"remote_ratio int64\n",
"company_location object\n",
"company_size object\n",
"above_median_salary int64\n",
"salary_category category\n",
"dtype: object\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIjCAYAAAAJLyrXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACHyElEQVR4nOzdd3xV9eH/8dcdyc3emySMEEiYCgrGAQ4UETeto6DWtmottq4qddTZlqr91VWr9tsqttW6dxVFpiKCskdYYVwgZNzsndx7z++PwK2RYW5Icm6S9/PxuA+4555zz/sml3DfOed8PhbDMAxERERERESk3axmBxAREREREelpVKRERERERET8pCIlIiIiIiLiJxUpERERERERP6lIiYiIiIiI+ElFSkRERERExE8qUiIiIiIiIn5SkRIREREREfGTipSIiIiIiIifVKRERERERET8pCIlItJN5syZg8Vi8d1CQkIYMmQIN910E8XFxWbHExERET/YzQ4gItLXPPTQQwwcOJDGxka++OILnn32WT766CM2bNhAWFiY2fFERESkHVSkRES62ZQpUzjhhBMA+NnPfkZ8fDx//vOfee+997jyyitNTiciIiLtoVP7RERMduaZZwKwc+dOAMrLy/n1r3/NyJEjiYiIICoqiilTprB27dpDtm1sbOSBBx5gyJAhhISEkJqayqWXXkpBQQEAu3btanM64Xdvp59+uu+5Fi1ahMVi4bXXXuPuu+8mJSWF8PBwLrzwQvbs2XPIvpcvX865555LdHQ0YWFhTJw4kaVLlx72NZ5++umH3f8DDzxwyLr//ve/GTt2LKGhocTFxXHFFVccdv9He23f5vV6eeKJJxg+fDghISEkJydzww03UFFR0Wa9AQMGcP755x+yn5tuuumQ5zxc9scee+yQrylAU1MT999/P4MHD8bhcJCRkcGdd95JU1PTYb9W3/bzn/+c7OxswsLCiIuL48wzz+Tzzz9vs857773H1KlTSUtLw+FwkJWVxcMPP4zH42mz3umnn86IESNYuXIlJ598MqGhoQwcOJDnnnuuzXoH3wdHuv34xz8+JOeRvr9z5szxrfP555/zwx/+kMzMTN/X4dZbb6WhocG3zo9//OOj7ttisbBr167v/bqJiHQHHZESETHZwdITHx8PwI4dO3j33Xf54Q9/yMCBAykuLub5559n4sSJbNq0ibS0NAA8Hg/nn38+8+fP54orruDmm2+mpqaGefPmsWHDBrKysnz7uPLKKznvvPPa7Peuu+46bJ7f//73WCwWZs2aRUlJCU888QSTJk1izZo1hIaGArBgwQKmTJnC2LFjuf/++7Farbz44ou+D/rjxo075HnT09OZPXs2ALW1tdx4442H3fdvf/tbLrvsMn72s59RWlrK008/zYQJE1i9ejUxMTGHbHP99ddz2mmnAfD222/zzjvvtHn8hhtuYM6cOVx77bX86le/YufOnfzlL39h9erVLF26lKCgoMN+HfxRWVnpe23f5vV6ufDCC/niiy+4/vrryc3NZf369Tz++ONs3bqVd99996jP29zczIwZM0hPT6e8vJznn3+ec889l/z8fDIzM4HWa+8iIiK47bbbiIiIYMGCBdx3331UV1fz2GOPtXm+iooKzjvvPC677DKuvPJKXn/9dW688UaCg4P5yU9+0mbdX/3qV5x44oltlv3sZz87YtacnBzuueceAFwuF7feemubx9944w3q6+u58cYbiY+PZ8WKFTz99NPs3buXN954A2j9Xk2aNMm3zVVXXcUll1zCpZde6luWmJh41K+ZiEi3MUREpFu8+OKLBmB89tlnRmlpqbFnzx7j1VdfNeLj443Q0FBj7969hmEYRmNjo+HxeNpsu3PnTsPhcBgPPfSQb9kLL7xgAMaf//znQ/bl9Xp92wHGY489dsg6w4cPNyZOnOi7v3DhQgMw+vXrZ1RXV/uWv/766wZgPPnkk77nzs
7ONiZPnuzbj2EYRn19vTFw4EDj7LPPPmRfJ598sjFixAjf/dLSUgMw7r//ft+yXbt2GTabzfj973/fZtv169cbdrv9kOXbtm0zAOOll17yLbv//vuNb//X9vnnnxuA8fLLL7fZdu7cuYcs79+/vzF16tRDss+cOdP47n+X381+5513GklJScbYsWPbfE3/9a9/GVar1fj888/bbP/cc88ZgLF06dJD9nc0K1asMADjzTff9C2rr68/ZL0bbrjBCAsLMxobG33LJk6caADG//t//8+3rKmpyTjuuOOMpKQko7m52TCM/70P3njjjUOeNzw83LjmmmsOWX7KKacYZ5xxhu/+wffdiy++eNScs2fPNiwWi7F79+7Dvt7vfp1FRAKJTu0TEelmkyZNIjExkYyMDK644goiIiJ455136NevHwAOhwOrtfXHs8fjoaysjIiICIYOHcqqVat8z/PWW2+RkJDAL3/5y0P28d1T0fxx9dVXExkZ6bv/gx/8gNTUVD766CMA1qxZw7Zt2/jRj35EWVkZLpcLl8tFXV0dZ511FkuWLMHr9bZ5zsbGRkJCQo6637fffhuv18tll13me06Xy0VKSgrZ2dksXLiwzfrNzc1A69frSN544w2io6M5++yz2zzn2LFjiYiIOOQ5W1pa2qzncrlobGw8au59+/bx9NNP89vf/paIiIhD9p+bm0tOTk6b5zx4Oud39384jY2NuFwu8vPzefLJJwkNDfVdYwf4jhIC1NTU4HK5OO2006ivr2fz5s1tnstut3PDDTf47gcHB3PDDTdQUlLCypUrvzfLkTQ3Nx/1+/DdnHV1dbhcLk4++WQMw2D16tUd3reIiFl0ap+ISDd75plnGDJkCHa7neTkZIYOHeorTtB6OtiTTz7JX//6V3bu3NnmWpeDp/9B6ymBQ4cOxW7v3B/l2dnZbe5bLBYGDx7suzZl27ZtAFxzzTVHfI6qqipiY2N9910u1yHP+13btm3DMIwjrvfdU/AqKysBDikv333OqqoqkpKSDvt4SUlJm/uffvqp36eO3X///aSlpXHDDTfw5ptvHrL//Pz8Iz7nd/d/OHPmzPGdBpmSksK8efPo37+/7/GNGzdy7733smDBAqqrq9tsW1VV1eZ+Wloa4eHhbZYNGTIEaL3m7KSTTvrePIdTWVnZJtPhOJ1O7rvvPt5///1Drk/7bk4RkZ5ARUpEpJuNGzeuzRGF7/rDH/7Ab3/7W37yk5/w8MMPExcXh9Vq5ZZbbjnkSI8ZDmZ47LHHOO644w67zrfLTXNzM/v37+fss8/+3ue1WCx8/PHH2Gy2oz4nQFFREdBaLo72nElJSbz88suHffy7BWf8+PH87ne/a7PsL3/5C++9995ht8/Pz2fOnDn8+9//Puy1Vl6vl5EjR/LnP//5sNtnZGQcMftBF1xwAYMHD6akpITnnnuOyy+/nC+++IIBAwZQWVnJxIkTiYqK4qGHHiIrK4uQkBBWrVrFrFmzuu39UlRUxOTJk4/4uMfj4eyzz6a8vJxZs2aRk5NDeHg4+/bt48c//nFAvK9FRPylIiUiEmDefPNNzjjjDP7xj3+0WV5ZWUlCQoLvflZWFsuXL6elpaVTBkw46OARp4MMw2D79u2MGjXKt1+AqKioNgMDHMnatWtpaWk5ank8+LyGYTBw4EDfUZKj2bRpExaLhaFDhx71OT/77DNOOeWUNqeWHUlCQsIhr+loA0LcddddHHfccVx++eVH3P/atWs566yzOny6Zb9+/XynfV566aUkJCTw7LPP8sgjj7Bo0SLKysp4++23mTBhgm+bgyNAfldhYSF1dXVtjkpt3boVaB21sCP27t1LTU0Nubm5R1xn/fr1bN26lZdeeomrr77at3zevHkd2qeISCDQNVIiIgHGZrNhGEabZW+88Qb79u1rs2zatGm4XC7+8pe/HPIc393eH//85z+pqanx3X/zzTfZv38/U6ZMAWDs2LFkZWXxpz/9id
ra2kO2Ly0tPSS7zWY77NDi33bppZdis9l48MEHD8lvGAZlZWW++263m7feeotx48Yd9dS+yy67DI/Hw8MPP3zIY26
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2wAAAIjCAYAAAB/FZhcAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABu0UlEQVR4nO3deVxUZf//8feAsrgAIsqSgpA7SpqWkYVZlhalpbkkla2m0qatJkoa5m1lpgkudauZdFvWXWaaSVpShpnbTbm1SFIhqKFgoqAz5/dHP+brCCQoMAd4PR+PecSc65oz79lsPnNd5zoWwzAMAQAAAABMx8XZAQAAAAAApaNgAwAAAACTomADAAAAAJOiYAMAAAAAk6JgAwAAAACTomADAAAAAJOiYAMAAAAAk6JgAwAAAACTomADAAAAAJOiYAMAAAAAk6JgA1BjzZs3T3379pW/v7/q16+vgIAA9erVS0uWLJHNZnN2PAAAgAtmMQzDcHYIADgfkZGRCgwM1LXXXisvLy8dPXpUmzZt0rJlyzR06FD95z//cXZEAACAC0LBBqDGOnXqlOrXr19i+yOPPKI5c+YoIyNDrVq1qv5gAAAAlYQpkQBqrNKKNUn2Is3F5f/+iVuxYoWio6MVFBQkd3d3XXzxxXrhhRdktVodbnvNNdfIYrHYL35+foqOjtYPP/zg0M9isej555932Pbyyy/LYrHommuucdh+8uRJPf/882rbtq08PDwUGBiogQMH6pdffpEk/frrr7JYLFq8eLHD7WJjY2WxWHTPPffYty1evFgWi0Vubm46dOiQQ/+0tDR77i1btji0LV++XN26dZOnp6f8/Px055136o8//ijx3O3Zs0dDhgxRs2bN5OnpqXbt2mnChAmSpOeff97huSnt8uWXX9qfx06dOpXYf3mMGjVKbdq0UYMGDeTr66trr71WX331lUOfiryenTp10tatW3XllVfK09NToaGhmjdvnkO/L7/88h8f15mvwZn7Lq3vma/jV199pcGDBys4OFju7u5q2bKlxo4dqxMnTtj73HPPPed8Xn/99VdJf7+3b7755jKfu+LHUfw6/JPt27frxhtvlJeXlxo1aqTrrrtOmzZtsrcXv9f+6XL2e7bYvn37ZLFYNHPmzBJt33zzjSwWi30EvPh9Vfze8/LyUtOmTfXYY4/p5MmTDrc9ffq0XnjhBV188cVyd3dXq1at9Nxzz6mwsNChX6tWrewZXVxcFBAQoKFDhyozM9Pep/hz98orr5TI2KlTpxKf43379mnw4MEKCgqSi4uLff/leZ+fT+7SLmeyWCx6+OGHy7zP4tfvzPfOP+37zB+3XnnlFV155ZVq2rSpPD091a1bN73//vvnfJwAqkY9ZwcAgAt19OhRnT59WseOHdPWrVv1yiuvaNiwYQoODrb3Wbx4sRo1aqRx48apUaNGWr9+vSZNmqT8/Hy9/PLLDvtr3769JkyYIMMw9Msvv+jVV1/VTTfd5PBlr7QM06ZNK7HdarXq5ptv1rp16zRs2DA99thjOnbsmFJSUvTDDz/o4osvLnV/P//8s954440y78/V1VVLly7V2LFj7dsWLVokDw+PEl9yFy9erHvvvVeXXXaZpk2bppycHM2aNUsbN27U9u3b5ePjI0lKT0/X1Vdfrfr162vkyJFq1aqVfvnlF61cuVJTp07VwIED1bp1a/t+x44dqw4dOmjkyJH2bR06dCgzc3kVFRXpzjvvVIsWLZSbm6v58+erX79+2r17t/01rcjreeTIEd10000aMmSI7rjjDr333nsaPXq03NzcdN999zn0ffTRR3XZZZc5bHvggQfKzFr8XpGkw4cPO7we0t+FckFBgUaPHq2mTZtq8+bNev311/X7779r+fLlkqSHHnpIffr0sd/mrrvu0m233aaBAwfatzVr1qy8T1+57Ny5U1dffbW8vLz09NNPq379+po/f76uueYabdiwQT169F
BUVJTefvtt+22mTp0qSfbHK0lXXnllqfsPCwtTz549lZycXOI5SU5OVuPGjTVgwACH7UOGDFGrVq00bdo0bdq0SbNnz9aRI0e0ZMkSe58HHnhAb731lm6//XY98cQT+vbbbzVt2jTt3r1bH374ocP+rr76ao0cOVI2m00//PCDXnvtNWVlZZUo/svDarWqf//+2r9/vx5//HG1bdtWFovF/pycS0Vyd+nSRU888YTDtiVLliglJaXCuc/02muv6a+//pIk7d69Wy+++KKee+45+2e2UaNG9r6zZs1S//79FRMTo6KiIi1btkyDBw/WJ598oujo6AvKAeA8GABQw7Vr186QZL/cfffdxqlTpxz6FBQUlLjdQw89ZDRo0MA4efKkfVuvXr2MXr16OfR77rnnDEnGwYMH7dskGfHx8fbrTz/9tNG8eXOjW7duDrdfuHChIcl49dVXS9y/zWYzDMMwMjIyDEnGokWL7G1DhgwxOnXqZLRs2dIYMWKEffuiRYsMScYdd9xhdO7c2b79+PHjhpeXlzF8+HBDkvHdd98ZhmEYRUVFRvPmzY1OnToZJ06csPf/5JNPDEnGpEmT7NuioqKMxo0bG/v37y8159lCQkIcsp2pV69eRnh4eKltFbV582ZDkvH+++/bt1Xk9ZRkzJgxw76tsLDQ6NKli9G8eXOjqKjIMAzD+OKLLwxJxvLly0vst2HDhqU+zp49exq9e/e2Xy/tdSwt57Rp0wyLxVLieS529nvrTCEhIUZ0dHSpbWc+ji+++KLMPoZhGLfeeqvh5uZm/PLLL/ZtWVlZRuPGjY2oqKhSb1PaZ+OfzJ8/35Bk7N69276tqKjI8PPzc3g+4+PjDUlG//79HW4/ZswYQ5Lxv//9zzAMw9ixY4chyXjggQcc+j355JOGJGP9+vX2baW9N4cPH240aNDAfr349Xr55ZdLZA8PD3d4rHv37jUkGdOmTXPoV573eUVzl/b6xsbGGmd/ZZNkxMbGlnm/xf9WZGRklGg71/vk7PdtUVGR0alTJ+Paa68t8/4AVB2mRAKo8RYtWqSUlBQlJyfr/vvvV3JyssOojyR5enra/z527JgOHz6sq6++WgUFBdqzZ49D31OnTunw4cM6dOiQ0tLS9OGHHyoiIkJ+fn6l3v8ff/yh119/XRMnTnT4lVqSPvjgA/n5+emRRx4pcbuzpzgV27p1q5YvX65p06Y5TOs801133aU9e/bYpz5+8MEH8vb21nXXXefQb8uWLTp48KDGjBkjDw8P+/bo6Gi1b99eq1atkiQdOnRIqampuu+++xxGJv8p57lYrVYdPnxYhw8fVlFRUYVue/LkSR0+fFi7d+/WrFmz5Onpqe7du9vbK/J61qtXTw899JD9upubmx566CEdPHhQW7duPa/HJv09Euju7v6Pfc7Mefz4cR0+fFhXXnmlDMPQ9u3bz+t+i9+ff/75p06fPl3h21utVq1du1a33nqrwsLC7NsDAwM1fPhwff3118rPzz+vbGcaMmSIPDw8lJycbN/22Wef6fDhw7rzzjtL9I+NjXW4XvyZWb16tcN/x40b59CveDSq+L1crLCwUIcPH9bBgweVkpKi9evXl/h8SFJBQYH9fVp8OXtq7bFjxyRJTZs2PfcDP0tFc1dE8efkzz//rNSVcc983x45ckR5eXm6+uqrtW3btkq7DwDlR8EGoMaLjIxUnz59NHz4cL355puaMmWKFi1apI0bN9r77Ny5U7fddpu8vb3l5eWlZs2a2b805uXlOezvm2++UbNmzdS8eXNdeeWVOn36tJYvX15m4RIfH6+goCCHoqDYL7/8onbt2qlevfLPQH/22Wd19dVX/+OxSs2aNVN0dLQWLlwoSVq4cKFGjBhRosDbv3+/JKldu3Yl9tG+fXt7+759+yTpvI87K82ePXvUrFkzh+Ph3nnnnXLddvHixWrWrJk6duyodevWKSUlRSEhIfb2iryeQUFBatiwocO2tm3bSp
L9+J7zcfTo0RIF+tkyMzN1zz33yNfXV40aNVKzZs3Uq1evUnOW19q1a9WsWTP5+fnJw8NDl156qdauXVvu2x86dEg
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from typing import Tuple\n",
"import pandas as pd\n",
"from pandas import DataFrame\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
"\n",
"# Создание целевого признака\n",
"median_salary = df['salary_in_usd'].median()\n",
"df['above_median_salary'] = np.where(df['salary_in_usd'] > median_salary, 1, 0)\n",
"\n",
"# Разделение на признаки и целевую переменную\n",
"X = df.drop(columns=['salary_in_usd', 'above_median_salary'])\n",
"y = df['above_median_salary']\n",
"\n",
"# Примерная категоризация\n",
"df['salary_category'] = pd.cut(df['salary_in_usd'], bins=[0, 100000, 200000, np.inf], labels=[0, 1, 2])\n",
"\n",
"# Выбор признаков и целевых переменных\n",
"X = df.drop(columns=['salary_in_usd', 'salary_category'])\n",
"\n",
"def split_stratified_into_train_val_test(\n",
"    df_input,\n",
"    stratify_colname=\"y\",\n",
"    frac_train=0.6,\n",
"    frac_val=0.15,\n",
"    frac_test=0.25,\n",
"    random_state=None,\n",
") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
"    \"\"\"Split a dataframe into stratified train, validation and test parts.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df_input : DataFrame\n",
"        Full dataset. All columns, including the stratification column,\n",
"        are kept in the returned frames.\n",
"    stratify_colname : str\n",
"        Name of the column whose class proportions are preserved.\n",
"    frac_train, frac_val, frac_test : float\n",
"        Fractions of rows for each part; they must add up to 1.\n",
"        ``frac_val <= 0`` skips the validation split.\n",
"    random_state : int or None\n",
"        Passed to both ``train_test_split`` calls.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple\n",
"        ``(df_train, df_val, df_test, y_train, y_val, y_test)``. The ``y_*``\n",
"        items are single-column DataFrames holding ``stratify_colname``.\n",
"        When ``frac_val <= 0`` both validation items are empty DataFrames.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If the fractions do not add up to 1 or ``stratify_colname`` is not\n",
"        a column of ``df_input``.\n",
"    \"\"\"\n",
"    # Compare with a tolerance instead of ``!= 1.0``: valid inputs such as\n",
"    # 0.6 + 0.3 + 0.1 evaluate to 0.9999999999999999 in floating point.\n",
"    if abs(frac_train + frac_val + frac_test - 1.0) > 1e-9:\n",
"        raise ValueError(\n",
"            \"fractions %f, %f, %f do not add up to 1.0\"\n",
"            % (frac_train, frac_val, frac_test)\n",
"        )\n",
"\n",
"    if stratify_colname not in df_input.columns:\n",
"        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"\n",
"    X = df_input  # Contains all columns.\n",
"    y = df_input[[stratify_colname]]  # Dataframe of just the column on which to stratify.\n",
"\n",
"    # Split original dataframe into train and temp dataframes.\n",
"    df_train, df_temp, y_train, y_temp = train_test_split(\n",
"        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
"    )\n",
"\n",
"    if frac_val <= 0:\n",
"        # No validation part requested: everything outside train is the test set.\n",
"        assert len(df_input) == len(df_train) + len(df_temp)\n",
"        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
"\n",
"    # Split the temp dataframe into val and test dataframes; the test share is\n",
"    # expressed relative to the size of the temp part.\n",
"    relative_frac_test = frac_test / (frac_val + frac_test)\n",
"\n",
"    df_val, df_test, y_val, y_test = train_test_split(\n",
"        df_temp,\n",
"        y_temp,\n",
"        stratify=y_temp,\n",
"        test_size=relative_frac_test,\n",
"        random_state=random_state,\n",
"    )\n",
"\n",
"    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"    return df_train, df_val, df_test, y_train, y_val, y_test\n",
"\n",
"# Stratified 80/20 split on the binary target; no validation part is requested\n",
"X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
"    df,\n",
"    stratify_colname=\"above_median_salary\",\n",
"    frac_train=0.80,\n",
"    frac_val=0,\n",
"    frac_test=0.20,\n",
"    random_state=42,\n",
")\n",
"\n",
"# Show each part preceded by its label\n",
"for part_label, part_frame in (\n",
"    (\"X_train\", X_train),\n",
"    (\"y_train\", y_train),\n",
"    (\"X_test\", X_test),\n",
"    (\"y_test\", y_test),\n",
"):\n",
"    display(part_label, part_frame)\n",
"\n",
"# Check the column types after the transformations\n",
"print(df.dtypes)\n",
"\n",
"# Salary distribution\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"sns.histplot(df['salary_in_usd'], bins=50, kde=True, ax=ax)\n",
"ax.set_title('Распределение зарплат')\n",
"ax.set_xlabel('Зарплата (USD)')\n",
"ax.set_ylabel('Частота')\n",
"plt.show()\n",
"\n",
"# Salary against experience level\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"sns.boxplot(x='experience_level', y='salary_in_usd', data=df, ax=ax)\n",
"ax.set_title('Зависимость зарплаты от уровня опыта')\n",
"ax.set_xlabel('Уровень опыта')\n",
"ax.set_ylabel('Зарплата (USD)')\n",
"plt.show()"
]
},
2024-11-09 11:59:00 +04:00
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
2024-11-23 12:17:48 +04:00
"ename": "IndexError",
"evalue": "Index dimension must be 1 or 2",
2024-11-09 11:59:00 +04:00
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
2024-11-23 12:17:48 +04:00
"\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[14], line 71\u001b[0m\n\u001b[0;32m 62\u001b[0m pipeline_end \u001b[38;5;241m=\u001b[39m Pipeline(\n\u001b[0;32m 63\u001b[0m [\n\u001b[0;32m 64\u001b[0m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfeatures_preprocessing\u001b[39m\u001b[38;5;124m\"\u001b[39m, features_preprocessing),\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 67\u001b[0m ]\n\u001b[0;32m 68\u001b[0m )\n\u001b[0;32m 70\u001b[0m \u001b[38;5;66;03m# Демонстрация работы конвейера для предобработки данных при классификации\u001b[39;00m\n\u001b[1;32m---> 71\u001b[0m preprocessing_result \u001b[38;5;241m=\u001b[39m \u001b[43mpipeline_end\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 72\u001b[0m preprocessed_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(\n\u001b[0;32m 73\u001b[0m preprocessing_result,\n\u001b[0;32m 74\u001b[0m columns\u001b[38;5;241m=\u001b[39mpipeline_end\u001b[38;5;241m.\u001b[39mget_feature_names_out(),\n\u001b[0;32m 75\u001b[0m )\n\u001b[0;32m 77\u001b[0m preprocessed_df\n",
"File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\pipeline.py:533\u001b[0m, in \u001b[0;36mPipeline.fit_transform\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 490\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model and transform with the final estimator.\u001b[39;00m\n\u001b[0;32m 491\u001b[0m \n\u001b[0;32m 492\u001b[0m \u001b[38;5;124;03mFit all the transformers one after the other and sequentially transform\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 530\u001b[0m \u001b[38;5;124;03m Transformed samples.\u001b[39;00m\n\u001b[0;32m 531\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 532\u001b[0m routed_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_method_params(method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m, props\u001b[38;5;241m=\u001b[39mparams)\n\u001b[1;32m--> 533\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrouted_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 535\u001b[0m last_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_final_estimator\n\u001b[0;32m 536\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m)):\n",
"File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\pipeline.py:406\u001b[0m, in \u001b[0;36mPipeline._fit\u001b[1;34m(self, X, y, routed_params)\u001b[0m\n\u001b[0;32m 404\u001b[0m cloned_transformer \u001b[38;5;241m=\u001b[39m clone(transformer)\n\u001b[0;32m 405\u001b[0m \u001b[38;5;66;03m# Fit or load from cache the current transformer\u001b[39;00m\n\u001b[1;32m--> 406\u001b[0m X, fitted_transformer \u001b[38;5;241m=\u001b[39m \u001b[43mfit_transform_one_cached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 407\u001b[0m \u001b[43m \u001b[49m\u001b[43mcloned_transformer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 408\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 409\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 410\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 411\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage_clsname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPipeline\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 412\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_message\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep_idx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 413\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 414\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 415\u001b[0m \u001b[38;5;66;03m# Replace the transformer of the step with the fitted\u001b[39;00m\n\u001b[0;32m 416\u001b[0m \u001b[38;5;66;03m# transformer. 
This is necessary when loading the transformer\u001b[39;00m\n\u001b[0;32m 417\u001b[0m \u001b[38;5;66;03m# from the cache.\u001b[39;00m\n\u001b[0;32m 418\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps[step_idx] \u001b[38;5;241m=\u001b[39m (name, fitted_transformer)\n",
"File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\joblib\\memory.py:312\u001b[0m, in \u001b[0;36mNotMemorizedFunc.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 311\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 312\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\pipeline.py:1310\u001b[0m, in \u001b[0;36m_fit_transform_one\u001b[1;34m(transformer, X, y, weight, message_clsname, message, params)\u001b[0m\n\u001b[0;32m 1308\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[0;32m 1309\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(transformer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m-> 1310\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mtransformer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfit_transform\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1311\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1312\u001b[0m res \u001b[38;5;241m=\u001b[39m transformer\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, {}))\u001b[38;5;241m.\u001b[39mtransform(\n\u001b[0;32m 1313\u001b[0m X, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtransform\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\n\u001b[0;32m 1314\u001b[0m )\n",
"File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\utils\\_set_output.py:316\u001b[0m, in \u001b[0;36m_wrap_method_output.<locals>.wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m 314\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[0;32m 315\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 316\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[0;32m 318\u001b[0m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m 319\u001b[0m return_tuple \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 320\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m 321\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[0;32m 322\u001b[0m )\n",
"File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\base.py:1098\u001b[0m, in \u001b[0;36mTransformerMixin.fit_transform\u001b[1;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[0;32m 1083\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 1084\u001b[0m (\n\u001b[0;32m 1085\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis object (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m) has a `transform`\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1093\u001b[0m \u001b[38;5;167;01mUserWarning\u001b[39;00m,\n\u001b[0;32m 1094\u001b[0m )\n\u001b[0;32m 1096\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m y \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 1097\u001b[0m \u001b[38;5;66;03m# fit method of arity 1 (unsupervised transformation)\u001b[39;00m\n\u001b[1;32m-> 1098\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfit_params\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtransform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1099\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1100\u001b[0m \u001b[38;5;66;03m# fit method of arity 2 (supervised transformation)\u001b[39;00m\n\u001b[0;32m 1101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfit(X, y, 
\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfit_params)\u001b[38;5;241m.\u001b[39mtransform(X)\n",
"File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\utils\\_set_output.py:316\u001b[0m, in \u001b[0;36m_wrap_method_output.<locals>.wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m 314\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[0;32m 315\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 316\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[0;32m 318\u001b[0m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m 319\u001b[0m return_tuple \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 320\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m 321\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[0;32m 322\u001b[0m )\n",
"Cell \u001b[1;32mIn[14], line 18\u001b[0m, in \u001b[0;36mSalaryFeatures.transform\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtransform\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, y\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[0;32m 16\u001b[0m \u001b[38;5;66;03m# Создание новых признаков\u001b[39;00m\n\u001b[0;32m 17\u001b[0m X \u001b[38;5;241m=\u001b[39m X\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m---> 18\u001b[0m X[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwork_year_to_remote_ratio\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mX\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mwork_year\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;241m/\u001b[39m X[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mremote_ratio\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 19\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m X\n",
"File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\scipy\\sparse\\_csr.py:24\u001b[0m, in \u001b[0;36m_csr_base.__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, key):\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m2\u001b[39m:\n\u001b[1;32m---> 24\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__getitem__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, \u001b[38;5;28mtuple\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(key) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m 27\u001b[0m key \u001b[38;5;241m=\u001b[39m key[\u001b[38;5;241m0\u001b[39m]\n",
"File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\scipy\\sparse\\_index.py:52\u001b[0m, in \u001b[0;36mIndexMixin.__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 51\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, key):\n\u001b[1;32m---> 52\u001b[0m row, col \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_indices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 54\u001b[0m \u001b[38;5;66;03m# Dispatch to specialized methods.\u001b[39;00m\n\u001b[0;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(row, INT_TYPES):\n",
"File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\scipy\\sparse\\_index.py:186\u001b[0m, in \u001b[0;36mIndexMixin._validate_indices\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 184\u001b[0m row \u001b[38;5;241m=\u001b[39m _validate_bool_idx(bool_row, M, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrow\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(row, \u001b[38;5;28mslice\u001b[39m):\n\u001b[1;32m--> 186\u001b[0m row \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_asindices\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mM\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 188\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m isintlike(col):\n\u001b[0;32m 189\u001b[0m col \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(col)\n",
"File \u001b[1;32md:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\scipy\\sparse\\_index.py:212\u001b[0m, in \u001b[0;36mIndexMixin._asindices\u001b[1;34m(self, idx, length)\u001b[0m\n\u001b[0;32m 209\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIndexError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124minvalid index\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[0;32m 211\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m x\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m):\n\u001b[1;32m--> 212\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIndexError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mIndex dimension must be 1 or 2\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m x\u001b[38;5;241m.\u001b[39msize \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m x\n",
"\u001b[1;31mIndexError\u001b[0m: Index dimension must be 1 or 2"
2024-11-09 11:59:00 +04:00
]
}
],
"source": [
2024-11-23 12:17:48 +04:00
"import numpy as np\n",
2024-11-09 11:59:00 +04:00
"import pandas as pd\n",
2024-11-23 12:17:48 +04:00
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Load the salaries dataset\n",
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
"\n",
"# Binary target: 1 when salary_in_usd is strictly above the median, otherwise 0\n",
"median_salary = df['salary_in_usd'].median()\n",
"is_above_median = df['salary_in_usd'] > median_salary\n",
"df['above_median_salary'] = np.where(is_above_median, 1, 0)\n",
"\n",
"# Target vector and feature matrix (the raw salary and the target are removed from X)\n",
"y = df['above_median_salary']\n",
"X = df.drop(columns=['salary_in_usd', 'above_median_salary'])\n",
"\n",
"# Hold out 20% of the rows for testing, stratified on the target\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X,\n",
"    y,\n",
"    test_size=0.2,\n",
"    random_state=42,\n",
"    stratify=y,\n",
")\n",
"\n",
"# Построение конвейеров предобработки\n",
"\n",
"class SalaryFeatures(BaseEstimator, TransformerMixin):\n",
"    \"\"\"Transformer that appends the ``work_year_to_remote_ratio`` column.\n",
"\n",
"    The transformer learns nothing in ``fit``. ``transform`` works on a pandas\n",
"    DataFrame that has ``work_year`` and ``remote_ratio`` columns and returns a\n",
"    copy with one extra column.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self):\n",
"        pass\n",
"\n",
"    def fit(self, X, y=None):\n",
"        \"\"\"Return ``self`` unchanged; there is nothing to fit.\"\"\"\n",
"        return self\n",
"\n",
"    def transform(self, X, y=None):\n",
"        \"\"\"Return a copy of ``X`` with ``work_year_to_remote_ratio`` added.\n",
"\n",
"        Raises\n",
"        ------\n",
"        TypeError\n",
"            If ``X`` is not a pandas DataFrame (for example a NumPy array or a\n",
"            sparse matrix), because the columns are selected by name.\n",
"        \"\"\"\n",
"        if not isinstance(X, pd.DataFrame):\n",
"            raise TypeError(\n",
"                \"SalaryFeatures expects a pandas DataFrame with 'work_year' and \"\n",
"                f\"'remote_ratio' columns, got {type(X).__name__}\"\n",
"            )\n",
"        X = X.copy()\n",
"        # remote_ratio contains zeros, so a plain division produces inf.\n",
"        # Zeros are masked to NaN first; the affected rows get NaN in the new column.\n",
"        remote_ratio = X[\"remote_ratio\"].replace(0, np.nan)\n",
"        X[\"work_year_to_remote_ratio\"] = X[\"work_year\"] / remote_ratio\n",
"        return X\n",
"\n",
"    def get_feature_names_out(self, features_in):\n",
"        \"\"\"Return ``features_in`` followed by the name of the added column.\"\"\"\n",
"        new_features = [\"work_year_to_remote_ratio\"]\n",
"        return np.append(features_in, new_features, axis=0)\n",
"\n",
"# Numeric pipeline: fill missing values with the median, then standardise\n",
"preprocessing_num_class = Pipeline(steps=[\n",
"    ('imputer', SimpleImputer(strategy='median')),\n",
"    ('scaler', StandardScaler())\n",
"])\n",
"\n",
"# Categorical pipeline: fill missing values with the most frequent value, then one-hot encode.\n",
"# Dense output is required because the column transformers below emit pandas DataFrames.\n",
"preprocessing_cat_class = Pipeline(steps=[\n",
"    ('imputer', SimpleImputer(strategy='most_frequent')),\n",
"    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))\n",
"])\n",
"\n",
"# Column groups. salary_in_usd is not listed: it was dropped from X when the target\n",
"# was built, and selecting a missing column makes ColumnTransformer raise.\n",
"numeric_columns = [\"work_year\", \"salary\", \"remote_ratio\"]\n",
"cat_columns = [\"experience_level\", \"employment_type\", \"job_title\", \"salary_currency\", \"employee_residence\", \"company_location\", \"company_size\"]\n",
"\n",
"# Feature preprocessing. Columns not named above (the engineered\n",
"# work_year_to_remote_ratio column) are passed through unchanged.\n",
"features_preprocessing = ColumnTransformer(\n",
"    verbose_feature_names_out=False,\n",
"    transformers=[\n",
"        (\"prepocessing_num\", preprocessing_num_class, numeric_columns),\n",
"        (\"prepocessing_cat\", preprocessing_cat_class, cat_columns),\n",
"    ],\n",
"    remainder=\"passthrough\"\n",
").set_output(transform=\"pandas\")\n",
"\n",
"# Column removal\n",
"columns_to_drop = []  # List the output columns to remove here, if any\n",
"drop_columns = ColumnTransformer(\n",
"    verbose_feature_names_out=False,\n",
"    transformers=[\n",
"        (\"drop_columns\", \"drop\", columns_to_drop),\n",
"    ],\n",
"    remainder=\"passthrough\",\n",
").set_output(transform=\"pandas\")\n",
"\n",
"# Main pipeline. The custom feature step runs first because it selects columns\n",
"# by name and therefore needs the original DataFrame, not a transformed matrix.\n",
"pipeline_end = Pipeline(\n",
"    [\n",
"        (\"custom_features\", SalaryFeatures()),\n",
"        (\"features_preprocessing\", features_preprocessing),\n",
"        (\"drop_columns\", drop_columns),\n",
"    ]\n",
")\n",
"\n",
"# Run the preprocessing pipeline on the training features\n",
"preprocessing_result = pipeline_end.fit_transform(X_train)\n",
"\n",
"# The last step emits a DataFrame, so the output column names are read from it\n",
"feature_names = preprocessing_result.columns.to_numpy()\n",
"\n",
"# DataFrame with the transformed data\n",
"preprocessed_df = pd.DataFrame(\n",
"    preprocessing_result,\n",
"    columns=feature_names,\n",
")\n",
"\n",
"preprocessed_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Бизнес-цели**\n",
"\n",
"1. Предсказание заработной платы (Регрессия)\n",
"\n",
" Цель: Предсказать заработную плату (salary_in_usd) на основе других характеристик, таких как уровень опыта (experience_level), тип занятости (employment_type), должность (job_title), место проживания сотрудника (employee_residence), размер компании (company_size) и другие факторы.\n",
"\n",
" Применение: Это может быть полезно для HR-отделов, которые хотят оценить справедливую зарплату для новых сотрудников или для анализа рынка труда.\n",
"\n",
"2. Классификация уровня опыта по зарплате (Классификация)\n",
"\n",
" Цель: Классифицировать уровень опыта (experience_level) на основе заработной платы (salary_in_usd) и других факторов.\n",
"\n",
" Применение: Это может помочь в оценке того, на каком уровне опыта находится сотрудник, основываясь на его зарплате, что может быть полезно для оценки карьерного роста."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. Прогнозирование зарплаты"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" work_year experience_level employment_type job_title \\\n",
"0 2023 SE FT Principal Data Scientist \n",
"1 2023 MI CT ML Engineer \n",
"2 2023 MI CT ML Engineer \n",
"3 2023 SE FT Data Scientist \n",
"4 2023 SE FT Data Scientist \n",
"\n",
" salary salary_currency salary_in_usd employee_residence remote_ratio \\\n",
"0 80000 EUR 85847 ES 100 \n",
"1 30000 USD 30000 US 100 \n",
"2 25500 USD 25500 US 100 \n",
"3 175000 USD 175000 CA 100 \n",
"4 120000 USD 120000 CA 100 \n",
"\n",
" company_location company_size \n",
"0 ES L \n",
"1 US S \n",
"2 US S \n",
"3 CA M \n",
"4 CA M \n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 3755 entries, 0 to 3754\n",
"Data columns (total 11 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 work_year 3755 non-null int64 \n",
" 1 experience_level 3755 non-null object\n",
" 2 employment_type 3755 non-null object\n",
" 3 job_title 3755 non-null object\n",
" 4 salary 3755 non-null int64 \n",
" 5 salary_currency 3755 non-null object\n",
" 6 salary_in_usd 3755 non-null int64 \n",
" 7 employee_residence 3755 non-null object\n",
" 8 remote_ratio 3755 non-null int64 \n",
" 9 company_location 3755 non-null object\n",
" 10 company_size 3755 non-null object\n",
"dtypes: int64(4), object(7)\n",
"memory usage: 322.8+ KB\n",
"None\n",
" work_year salary salary_in_usd remote_ratio\n",
"count 3755.000000 3.755000e+03 3755.000000 3755.000000\n",
"mean 2022.373635 1.906956e+05 137570.389880 46.271638\n",
"std 0.691448 6.716765e+05 63055.625278 48.589050\n",
"min 2020.000000 6.000000e+03 5132.000000 0.000000\n",
"25% 2022.000000 1.000000e+05 95000.000000 0.000000\n",
"50% 2022.000000 1.380000e+05 135000.000000 0.000000\n",
"75% 2023.000000 1.800000e+05 175000.000000 100.000000\n",
"max 2023.000000 3.040000e+07 450000.000000 100.000000\n",
"work_year 0\n",
"experience_level 0\n",
"employment_type 0\n",
"job_title 0\n",
"salary 0\n",
"salary_currency 0\n",
"salary_in_usd 0\n",
"employee_residence 0\n",
"remote_ratio 0\n",
"company_location 0\n",
"company_size 0\n",
"dtype: int64\n",
"Mean Squared Error: 2482079980.9527493\n",
"R^2 Score: 0.37127352660208646\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"import seaborn as sns\n",
2024-11-09 11:59:00 +04:00
"import matplotlib.pyplot as plt\n",
2024-11-23 12:17:48 +04:00
"\n",
"# Load the dataset\n",
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
"\n",
"# Seed used for the train/test split\n",
"random_state = 42\n",
"\n",
"# Quick look at the data: first rows, schema, summary statistics\n",
"print(df.head())\n",
"print(df.info())\n",
"print(df.describe())\n",
"\n",
"# Missing values per column\n",
"print(df.isnull().sum())\n",
"\n",
"# Columns fed to the model, grouped by how they are preprocessed\n",
"categorical_features = [\n",
"    'experience_level',\n",
"    'employment_type',\n",
"    'job_title',\n",
"    'employee_residence',\n",
"    'company_location',\n",
"    'company_size',\n",
"]\n",
"numeric_features = ['work_year', 'remote_ratio']\n",
"\n",
"# Scale the numeric columns and one-hot encode the categorical ones\n",
"preprocessor = ColumnTransformer(\n",
"    transformers=[\n",
"        ('num', StandardScaler(), numeric_features),\n",
"        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),\n",
"    ]\n",
")\n",
"\n",
"# Target and features\n",
"y = df['salary_in_usd']\n",
"X = df.drop(columns='salary_in_usd')\n",
"\n",
"# Train/test split (80/20)\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.2, random_state=random_state\n",
")\n",
"\n",
"# Preprocessing followed by a linear regression\n",
"model = Pipeline(\n",
"    steps=[\n",
"        ('preprocessor', preprocessor),\n",
"        ('regressor', LinearRegression()),\n",
"    ]\n",
")\n",
"model.fit(X_train, y_train)\n",
"\n",
"# Predict on the held-out rows\n",
"y_pred = model.predict(X_test)\n",
"\n",
"# Evaluate the model\n",
"mse = mean_squared_error(y_test, y_pred)\n",
"r2 = r2_score(y_test, y_pred)\n",
"\n",
"print(f\"Mean Squared Error: {mse}\")\n",
"print(f\"R^2 Score: {r2}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Классификация уровня опыта"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" EN 0.55 0.48 0.51 67\n",
" EX 0.46 0.26 0.33 23\n",
" MI 0.48 0.54 0.51 157\n",
" SE 0.83 0.83 0.83 504\n",
"\n",
" accuracy 0.72 751\n",
" macro avg 0.58 0.53 0.55 751\n",
"weighted avg 0.72 0.72 0.72 751\n",
"\n",
"Confusion Matrix:\n",
"[[ 32 0 20 15]\n",
" [ 0 6 5 12]\n",
" [ 14 0 84 59]\n",
" [ 12 7 65 420]]\n",
"Accuracy Score: 0.7217043941411452\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAhsAAAHHCAYAAAAWM5p0AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABVJElEQVR4nO3deXwM5x8H8M9ujs193yVuIcTR0Ij7CEEccVSVEqq0aaIIqlFH0IoGdRNtHakKbRUt6la0FURIEaSoNlQOCTlEsons/P5Q++s2QcJOZpP9vPua18s+88zMd5KSb77P88zIBEEQQERERCQSudQBEBERUfXGZIOIiIhExWSDiIiIRMVkg4iIiETFZIOIiIhExWSDiIiIRMVkg4iIiETFZIOIiIhExWSDiIiIRMVkg0hEV69eRY8ePWBtbQ2ZTIadO3dq9fx//vknZDIZNm7cqNXzVmWdO3dG586dpQ6DiP6FyQZVe9evX8fbb7+NunXrwsTEBFZWVmjXrh2WLVuGgoICUa8dFBSECxcu4OOPP8amTZvQqlUrUa9XmUaNGgWZTAYrK6syv45Xr16FTCaDTCbDokWLKnz+27dvIyIiAomJiVqIloikZCh1AERi2rNnD1599VUoFAqMHDkSTZs2RVFREX755RdMnToVSUlJ+Oyzz0S5dkFBAeLi4vDhhx8iNDRUlGvUqlULBQUFMDIyEuX8z2JoaIgHDx5g165dGDJkiMa+zZs3w8TEBIWFhc917tu3b2POnDmoXbs2WrRoUe7jDhw48FzXIyLxMNmgauvGjRsYOnQoatWqhSNHjsDV1VW9LyQkBNeuXcOePXtEu/6dO3cAADY2NqJdQyaTwcTERLTzP4tCoUC7du2wZcuWUslGbGwsAgIC8N1331VKLA8ePICZmRmMjY0r5XpEVH4cRqFqKyoqCvfv38e6des0Eo3H6tevjwkTJqg/P3z4EPPmzUO9evWgUChQu3ZtTJ8+HUqlUuO42rVro0+fPvjll1/wyiuvwMTEBHXr1sWXX36p7hMREYFatWoBAKZOnQqZTIbatWsDeDT88PjP/xYREQGZTKbRdvDgQbRv3x42NjawsLCAh4cHpk+frt7/pDkbR44cQYcOHWBubg4bGxv0798fly9fLvN6165dw6hRo2BjYwNra2uMHj0aDx48ePIX9j+GDRuGvXv3Ijs7W90WHx+Pq1evYtiwYaX63717F1OmTIGXlxcsLCxgZWWFXr164bffflP3OXr0KFq3bg0AGD16tHo45vF9du7cGU2bNkVCQgI6duwIMzMz9dflv3M2goKCYGJiUur+/f39YWtri9u3b5f7Xono+TDZoGpr165dqFu3Ltq2bVuu/m+99RZmzZqFl19+GUuWLEGnTp0QGRmJoUOHlup77do1DB48GN27d8fixYtha2uLUaNGISkpCQAwcOBALFmyBADw+uuvY9OmTVi6dGmF4k9KSkKfPn2gVCoxd+5cLF68GP369cOvv/761OMOHToEf39/ZGRkICIiAmFhYThx4gTatWuHP//8s1T/IUOGIC8vD5GRkRgyZAg2btyIOXPmlDvOgQMHQiaTYfv27eq22NhYNGrUCC+//HKp/n/88Qd27tyJPn364NNPP8XUqVNx4cIFdOrUSf2Dv3Hjxpg7dy4AYNy4cdi0aRM2bdqEjh07qs+TlZWFXr16oUWLFli6dCm6dOlSZnzLli2Do6MjgoKCUFJSAgBYu3YtDhw4gBUrVsDNza3c90pEz0kgqoZycnIEAEL//v3L1T8xMVEAILz11lsa7VOmTBEACEeOHFG31apVSwAgHD9+XN2WkZEhKBQKYfLkyeq2GzduCACEhQsXapwzKChIqFWrVqkYZs+eLfz7r+SSJUsEAMKdO3eeGPfja2zYsEHd1qJFC8HJyUnIyspSt/3222+CXC4XRo4cWep6b775psY5BwwYINjb2z/xmv++D3Nzc0EQBGHw4MFCt2
7dBEEQhJKSEsHFxUWYM2dOmV+DwsJCoaSkpNR9KBQKYe7cueq2+Pj4Uvf2WKdOnQQAQnR0dJn7OnXqpNG2f/9+AYDw0UcfCX/88YdgYWEhBAYGPvMeiUg7WNmgaik3NxcAYGlpWa7+P/74IwAgLCxMo33y5MkAUGpuh6enJzp06KD+7OjoCA8PD/zxxx/PHfN/PZ7r8f3330OlUpXrmNTUVCQmJmLUqFGws7NTtzdr1gzdu3dX3+e/vfPOOxqfO3TogKysLPXXsDyGDRuGo0ePIi0tDUeOHEFaWlqZQyjAo3kecvmjf3pKSkqQlZWlHiI6e/Zsua+pUCgwevTocvXt0aMH3n77bcydOxcDBw6EiYkJ1q5dW+5rEdGLYbJB1ZKVlRUAIC8vr1z9//rrL8jlctSvX1+j3cXFBTY2Nvjrr7802t3d3Uudw9bWFvfu3XvOiEt77bXX0K5dO7z11ltwdnbG0KFD8c033zw18Xgcp4eHR6l9jRs3RmZmJvLz8zXa/3svtra2AFChe+nduzcsLS3x9ddfY/PmzWjdunWpr+VjKpUKS5YsQYMGDaBQKODg4ABHR0ecP38eOTk55b7mSy+9VKHJoIsWLYKdnR0SExOxfPlyODk5lftYInoxTDaoWrKysoKbmxsuXrxYoeP+O0HzSQwMDMpsFwThua/xeD7BY6ampjh+/DgOHTqEESNG4Pz583jttdfQvXv3Un1fxIvcy2MKhQIDBw5ETEwMduzY8cSqBgDMnz8fYWFh6NixI7766ivs378fBw8eRJMmTcpdwQEefX0q4ty5c8jIyAAAXLhwoULHEtGLYbJB1VafPn1w/fp1xMXFPbNvrVq1oFKpcPXqVY329PR0ZGdnq1eWaIOtra3Gyo3H/ls9AQC5XI5u3brh008/xaVLl/Dxxx/jyJEj+Omnn8o89+M4k5OTS+27cuUKHBwcYG5u/mI38ATDhg3DuXPnkJeXV+ak2se2bduGLl26YN26dRg6dCh69OgBPz+/Ul+T8iZ+5ZGfn4/Ro0fD09MT48aNQ1RUFOLj47V2fiJ6OiYbVG29//77MDc3x1tvvYX09PRS+69fv45ly5YBeDQMAKDUipFPP/0UABAQEKC1uOrVq4ecnBycP39e3ZaamoodO3Zo9Lt7926pYx8/3Oq/y3Efc3V1RYsWLRATE6Pxw/vixYs4cOCA+j7F0KVLF8ybNw8rV66Ei4vLE/sZGBiUqpp8++23+PvvvzXaHidFZSVmFTVt2jSkpKQgJiYGn376KWrXro2goKAnfh2JSLv4UC+qturVq4fY2Fi89tpraNy4scYTRE+cOIFvv/0Wo0aNAgA0b94cQUFB+Oyzz5CdnY1OnTrh9OnTiImJQWBg4BOXVT6PoUOHYtq0aRgwYADee+89PHjwAGvWrEHDhg01JkjOnTsXx48fR0BAAGrVqoWMjAysXr0aNWrUQPv27Z94/oULF6JXr17w9fXFmDFjUFBQgBUrVsDa2hoRERFau4//ksvlmDFjxjP79enTB3PnzsXo0aPRtm1bXLhwAZs3b0bdunU1+tWrVw82NjaIjo6GpaUlzM3N4ePjgzp16lQoriNHjmD16tWYPXu2einuhg0b0LlzZ8ycORNRUVEVOh8RPQeJV8MQie73338Xxo4dK9SuXVswNjYWLC0thXbt2gkrVqwQCgsL1f2Ki4uFOXPmCHXq1BGMjIyEmjVrCuHh4Rp9BOHR0teAgIBS1/nvkssnLX0VBEE4cOCA0LRpU8HY2Fjw8PAQvvrqq1JLXw8fPiz0799fcHNzE4yNjQU3Nzfh9ddfF37//fdS1/jv8tBDhw4J7dq1E0xNTQUrKyuhb9++wqVLlzT6PL7ef5fWbtiwQQAg3Lhx44lfU0HQXPr6JE9a+jp58mTB1dVVMDU1Fdq1ayfExcWVuWT1+++/Fzw9PQVDQ0ON++zUqZPQpEmTMq/57/Pk5uYKtWrVEl5++WWhuLhYo9+kSZMEuVwuxM
XFPfUeiOjFyQShArPAiIiIiCqIczaIiIhIVEw2iIiISFRMNoiIiEhUTDaIiIhIVEw2iIiISFRMNoiIiEhUTDaIiIh
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"# Load the salaries dataset.\n",
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
"\n",
"# Fixed seed so that the split and the forest are reproducible.\n",
"random_state = 42\n",
"\n",
"# Input columns grouped by type; the target (experience_level) is in neither list.\n",
"categorical_features = ['employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']\n",
"numeric_features = ['work_year', 'salary_in_usd', 'remote_ratio']\n",
"\n",
"# Standardise numeric columns and one-hot encode categorical ones.\n",
"# Columns of X that are not listed here are dropped (ColumnTransformer's default remainder).\n",
"preprocessor = ColumnTransformer(\n",
"    transformers=[\n",
"        ('num', StandardScaler(), numeric_features),\n",
"        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])\n",
"\n",
"# Target and feature frame.\n",
"X = df.drop('experience_level', axis=1)\n",
"y = df['experience_level']\n",
"\n",
"# 80/20 train/test split.\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)\n",
"\n",
"# Preprocessing and the classifier are fitted together as one pipeline.\n",
"model = Pipeline(steps=[\n",
"    ('preprocessor', preprocessor),\n",
"    ('classifier', RandomForestClassifier(random_state=random_state))])\n",
"\n",
"model.fit(X_train, y_train)\n",
"\n",
"# Predictions on the held-out part.\n",
"y_pred = model.predict(X_test)\n",
"\n",
"# The confusion matrix is computed once, with an explicit label order, and reused for both\n",
"# the printout and the heatmap so that rows/columns and tick labels cannot disagree.\n",
"class_labels = model.classes_\n",
"conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)\n",
"\n",
"# Quality report.\n",
"print(\"Classification Report:\")\n",
"print(classification_report(y_test, y_pred))\n",
"\n",
"print(\"Confusion Matrix:\")\n",
"print(conf_matrix)\n",
"\n",
"print(f\"Accuracy Score: {accuracy_score(y_test, y_pred)}\")\n",
"\n",
"# Heatmap with the class names on both axes instead of bare 0..N-1 indices.\n",
"sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',\n",
"            xticklabels=class_labels, yticklabels=class_labels)\n",
"plt.xlabel('Predicted')\n",
"plt.ylabel('Actual')\n",
"plt.title('Confusion Matrix')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Ориентир**\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MAE: 37795.639591701794\n",
"MSE: 2482079980.9527493\n",
"RMSE: 49820.47752634201\n",
"R²: 0.37127352660208646\n",
"Ориентиры для предсказания заработной платы не достигнуты.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\MII\\AIM-PIbd-32-Kaznacheeva-E-K\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"# Load the salaries dataset.\n",
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
"\n",
"# Input columns grouped by type. 'salary' and 'salary_currency' stay in X but are not listed,\n",
"# so ColumnTransformer drops them (default remainder) and they cannot leak the target.\n",
"categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']\n",
"numeric_features = ['work_year', 'remote_ratio']\n",
"\n",
"preprocessor = ColumnTransformer(\n",
"    transformers=[\n",
"        ('num', StandardScaler(), numeric_features),\n",
"        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])\n",
"\n",
"# Target: salary in USD.\n",
"X = df.drop('salary_in_usd', axis=1)\n",
"y = df['salary_in_usd']\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"model = Pipeline(steps=[\n",
"    ('preprocessor', preprocessor),\n",
"    ('regressor', LinearRegression())])\n",
"\n",
"model.fit(X_train, y_train)\n",
"\n",
"y_pred = model.predict(X_test)\n",
"\n",
"mae = mean_absolute_error(y_test, y_pred)\n",
"mse = mean_squared_error(y_test, y_pred)\n",
"# RMSE is taken as the square root of the MSE computed above.\n",
"# mean_squared_error(..., squared=False) is deprecated since scikit-learn 1.4 (removed in 1.6)\n",
"# and produced a FutureWarning in this cell.\n",
"rmse = mse ** 0.5\n",
"r2 = r2_score(y_test, y_pred)\n",
"\n",
"print(f\"MAE: {mae}\")\n",
"print(f\"MSE: {mse}\")\n",
"print(f\"RMSE: {rmse}\")\n",
"print(f\"R²: {r2}\")\n",
"\n",
"# Benchmark check: all three thresholds must hold at once.\n",
"if r2 >= 0.75 and mae <= 15000 and rmse <= 20000:\n",
"    print(\"Ориентиры для предсказания заработной платы достигнуты!\")\n",
"else:\n",
"    print(\"Ориентиры для предсказания заработной платы не достигнуты.\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.7217043941411452\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" EN 0.55 0.48 0.51 67\n",
" EX 0.46 0.26 0.33 23\n",
" MI 0.48 0.54 0.51 157\n",
" SE 0.83 0.83 0.83 504\n",
"\n",
" accuracy 0.72 751\n",
" macro avg 0.58 0.53 0.55 751\n",
"weighted avg 0.72 0.72 0.72 751\n",
"\n",
"Confusion Matrix:\n",
"[[ 32 0 20 15]\n",
" [ 0 6 5 12]\n",
" [ 14 0 84 59]\n",
" [ 12 7 65 420]]\n",
"Ориентиры для классификации уровня опыта не достигнуты.\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
"\n",
"# Read the salaries dataset.\n",
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
"\n",
"# Model inputs grouped by type.\n",
"numeric_features = ['work_year', 'salary_in_usd', 'remote_ratio']\n",
"categorical_features = ['employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']\n",
"\n",
"# Standardise the numeric inputs and one-hot encode the categorical ones.\n",
"preprocessor = ColumnTransformer(transformers=[\n",
"    ('num', StandardScaler(), numeric_features),\n",
"    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),\n",
"])\n",
"\n",
"# experience_level is the target; every other column stays in X.\n",
"y = df['experience_level']\n",
"X = df.drop(columns='experience_level')\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"model = Pipeline(steps=[\n",
"    ('preprocessor', preprocessor),\n",
"    ('classifier', RandomForestClassifier(random_state=42)),\n",
"])\n",
"model.fit(X_train, y_train)\n",
"\n",
"y_pred = model.predict(X_test)\n",
"\n",
"accuracy = accuracy_score(y_test, y_pred)\n",
"print(f\"Accuracy: {accuracy}\")\n",
"\n",
"print(\"Classification Report:\")\n",
"print(classification_report(y_test, y_pred))\n",
"\n",
"print(\"Confusion Matrix:\")\n",
"print(confusion_matrix(y_test, y_pred))\n",
"\n",
"# Benchmark check: accuracy of at least 0.80 on the test part.\n",
"print(\n",
"    \"Ориентиры для классификации уровня опыта достигнуты!\"\n",
"    if accuracy >= 0.80\n",
"    else \"Ориентиры для классификации уровня опыта не достигнуты.\"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Конвейер"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
"from sklearn.pipeline import Pipeline\n",
"\n",
"# Columns by type; together the two lists cover every column of the dataset.\n",
"numeric_columns = [\"work_year\", \"salary\", \"salary_in_usd\", \"remote_ratio\"]\n",
"cat_columns = [\n",
"    \"experience_level\",\n",
"    \"employment_type\",\n",
"    \"job_title\",\n",
"    \"salary_currency\",\n",
"    \"employee_residence\",\n",
"    \"company_location\",\n",
"    \"company_size\",\n",
"]\n",
"\n",
"# Numeric branch: fill gaps with the median, then standardise.\n",
"preprocessing_num_class = Pipeline([\n",
"    ('imputer', SimpleImputer(strategy='median')),\n",
"    ('scaler', StandardScaler()),\n",
"])\n",
"\n",
"# Categorical branch: fill gaps with the most frequent value, then one-hot encode.\n",
"preprocessing_cat_class = Pipeline([\n",
"    ('imputer', SimpleImputer(strategy='most_frequent')),\n",
"    ('onehot', OneHotEncoder(handle_unknown='ignore')),\n",
"])\n",
"\n",
"# Both branches combined; output feature names are not prefixed with the branch name.\n",
"features_preprocessing = ColumnTransformer(\n",
"    transformers=[\n",
"        (\"prepocessing_num\", preprocessing_num_class, numeric_columns),\n",
"        (\"prepocessing_cat\", preprocessing_cat_class, cat_columns),\n",
"    ],\n",
"    remainder=\"passthrough\",\n",
"    verbose_feature_names_out=False,\n",
")\n",
"\n",
"# Final pipeline: currently the feature preprocessing step only.\n",
"pipeline_end = Pipeline(steps=[(\"features_preprocessing\", features_preprocessing)])\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'train_test_split' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[5], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Разделение данных на тренировочный и тестовый наборы\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m X_train, X_test \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_test_split\u001b[49m(df, test_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.2\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m42\u001b[39m)\n\u001b[0;32m 4\u001b[0m \u001b[38;5;66;03m# Применение конвейера для предобработки данных\u001b[39;00m\n\u001b[0;32m 5\u001b[0m preprocessing_result \u001b[38;5;241m=\u001b[39m pipeline_end\u001b[38;5;241m.\u001b[39mfit_transform(X_train)\n",
"\u001b[1;31mNameError\u001b[0m: name 'train_test_split' is not defined"
]
}
],
"source": [
"# Imported in this cell so that it also runs on a fresh kernel; the stored run failed with\n",
"# NameError: name 'train_test_split' is not defined.\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Split the data into train and test parts.\n",
"# NOTE: `df` is expected to have been loaded by an earlier cell.\n",
"X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)\n",
"\n",
"# Fit the preprocessing pipeline on the training part and transform it.\n",
"preprocessing_result = pipeline_end.fit_transform(X_train)\n",
"\n",
"# ColumnTransformer may return a SciPy sparse matrix when the one-hot columns dominate;\n",
"# pd.DataFrame needs a dense 2-D array to line the values up with the column names.\n",
"if hasattr(preprocessing_result, \"toarray\"):\n",
"    preprocessing_result = preprocessing_result.toarray()\n",
"\n",
"# Column names after the transformation.\n",
"feature_names = pipeline_end.named_steps['features_preprocessing'].get_feature_names_out()\n",
"\n",
"# DataFrame with the transformed data.\n",
"preprocessed_df = pd.DataFrame(\n",
"    preprocessing_result,\n",
"    columns=feature_names,\n",
")\n",
"\n",
"# Show the transformed DataFrame.\n",
"print(preprocessed_df)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aimenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}