1159 lines
51 KiB
Plaintext
1159 lines
51 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 26,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>work_year</th>\n",
|
|||
|
" <th>experience_level</th>\n",
|
|||
|
" <th>employment_type</th>\n",
|
|||
|
" <th>job_title</th>\n",
|
|||
|
" <th>salary</th>\n",
|
|||
|
" <th>salary_currency</th>\n",
|
|||
|
" <th>salary_in_usd</th>\n",
|
|||
|
" <th>employee_residence</th>\n",
|
|||
|
" <th>remote_ratio</th>\n",
|
|||
|
" <th>company_location</th>\n",
|
|||
|
" <th>company_size</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>2023</td>\n",
|
|||
|
" <td>SE</td>\n",
|
|||
|
" <td>FT</td>\n",
|
|||
|
" <td>Principal Data Scientist</td>\n",
|
|||
|
" <td>80000</td>\n",
|
|||
|
" <td>EUR</td>\n",
|
|||
|
" <td>85847</td>\n",
|
|||
|
" <td>ES</td>\n",
|
|||
|
" <td>100</td>\n",
|
|||
|
" <td>ES</td>\n",
|
|||
|
" <td>L</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>2023</td>\n",
|
|||
|
" <td>MI</td>\n",
|
|||
|
" <td>CT</td>\n",
|
|||
|
" <td>ML Engineer</td>\n",
|
|||
|
" <td>30000</td>\n",
|
|||
|
" <td>USD</td>\n",
|
|||
|
" <td>30000</td>\n",
|
|||
|
" <td>US</td>\n",
|
|||
|
" <td>100</td>\n",
|
|||
|
" <td>US</td>\n",
|
|||
|
" <td>S</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>2023</td>\n",
|
|||
|
" <td>MI</td>\n",
|
|||
|
" <td>CT</td>\n",
|
|||
|
" <td>ML Engineer</td>\n",
|
|||
|
" <td>25500</td>\n",
|
|||
|
" <td>USD</td>\n",
|
|||
|
" <td>25500</td>\n",
|
|||
|
" <td>US</td>\n",
|
|||
|
" <td>100</td>\n",
|
|||
|
" <td>US</td>\n",
|
|||
|
" <td>S</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>2023</td>\n",
|
|||
|
" <td>SE</td>\n",
|
|||
|
" <td>FT</td>\n",
|
|||
|
" <td>Data Scientist</td>\n",
|
|||
|
" <td>175000</td>\n",
|
|||
|
" <td>USD</td>\n",
|
|||
|
" <td>175000</td>\n",
|
|||
|
" <td>CA</td>\n",
|
|||
|
" <td>100</td>\n",
|
|||
|
" <td>CA</td>\n",
|
|||
|
" <td>M</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>2023</td>\n",
|
|||
|
" <td>SE</td>\n",
|
|||
|
" <td>FT</td>\n",
|
|||
|
" <td>Data Scientist</td>\n",
|
|||
|
" <td>120000</td>\n",
|
|||
|
" <td>USD</td>\n",
|
|||
|
" <td>120000</td>\n",
|
|||
|
" <td>CA</td>\n",
|
|||
|
" <td>100</td>\n",
|
|||
|
" <td>CA</td>\n",
|
|||
|
" <td>M</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" work_year experience_level employment_type job_title \\\n",
|
|||
|
"0 2023 SE FT Principal Data Scientist \n",
|
|||
|
"1 2023 MI CT ML Engineer \n",
|
|||
|
"2 2023 MI CT ML Engineer \n",
|
|||
|
"3 2023 SE FT Data Scientist \n",
|
|||
|
"4 2023 SE FT Data Scientist \n",
|
|||
|
"\n",
|
|||
|
" salary salary_currency salary_in_usd employee_residence remote_ratio \\\n",
|
|||
|
"0 80000 EUR 85847 ES 100 \n",
|
|||
|
"1 30000 USD 30000 US 100 \n",
|
|||
|
"2 25500 USD 25500 US 100 \n",
|
|||
|
"3 175000 USD 175000 CA 100 \n",
|
|||
|
"4 120000 USD 120000 CA 100 \n",
|
|||
|
"\n",
|
|||
|
" company_location company_size \n",
|
|||
|
"0 ES L \n",
|
|||
|
"1 US S \n",
|
|||
|
"2 US S \n",
|
|||
|
"3 CA M \n",
|
|||
|
"4 CA M "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 26,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
|
|||
|
"\n",
|
|||
|
"df.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"<h2>Бизнес-цели</h2>\n",
|
|||
|
"Задача регрессии: построить модель для прогноза зарплаты в USD (salary_in_usd), используя другие атрибуты набора данных.<br>\n",
|
|||
|
"Задача классификации: определить уровень опыта сотрудника (experience_level) на основе других характеристик, таких как job_title, salary_in_usd и employment_type."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проведем обработку данных и сформируем обучающую и тестовую выборки."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 27,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Transformed feature shape (train): (2029, 151)\n",
|
|||
|
"Transformed feature shape (test): (508, 151)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import numpy as np\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from sklearn.compose import ColumnTransformer\n",
|
|||
|
"from sklearn.pipeline import Pipeline\n",
|
|||
|
"from sklearn.impute import SimpleImputer\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
|
|||
|
"\n",
|
|||
|
"# Удаление выбросов из столбца `salary_in_usd` с использованием IQR\n",
|
|||
|
"Q1 = df['salary_in_usd'].quantile(0.25)\n",
|
|||
|
"Q3 = df['salary_in_usd'].quantile(0.75)\n",
|
|||
|
"IQR = Q3 - Q1\n",
|
|||
|
"df = df[(df['salary_in_usd'] >= Q1 - 1.5 * IQR) & (df['salary_in_usd'] <= Q3 + 1.5 * IQR)]\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование категориальных данных в числовые (если потребуется)\n",
|
|||
|
"if 'remote_ratio' in df.columns:\n",
|
|||
|
" df['remote_ratio'] = df['remote_ratio'].astype(int)\n",
|
|||
|
"\n",
|
|||
|
"# Удаление дубликатов\n",
|
|||
|
"df.drop_duplicates(inplace=True)\n",
|
|||
|
"\n",
|
|||
|
"# Определение целевой переменной и признаков\n",
|
|||
|
"X = df.drop(columns=['salary_in_usd', 'salary_currency', 'job_title']) # Признаки\n",
|
|||
|
"y = df['salary_in_usd'] # Целевая переменная для регрессии\n",
|
|||
|
"\n",
|
|||
|
"# Определение числовых и категориальных признаков\n",
|
|||
|
"numeric_features = ['work_year', 'remote_ratio']\n",
|
|||
|
"categorical_features = ['experience_level', 'employment_type', \n",
|
|||
|
" 'employee_residence', 'company_location', 'company_size']\n",
|
|||
|
"\n",
|
|||
|
"# Обработка числовых данных\n",
|
|||
|
"numeric_transformer = Pipeline(steps=[\n",
|
|||
|
" ('imputer', SimpleImputer(strategy='median')), # Заполнение пропусков медианой\n",
|
|||
|
" ('scaler', StandardScaler()) # Нормализация данных\n",
|
|||
|
"])\n",
|
|||
|
"\n",
|
|||
|
"# Обработка категориальных данных\n",
|
|||
|
"categorical_transformer = Pipeline(steps=[\n",
|
|||
|
" ('imputer', SimpleImputer(strategy='most_frequent')), # Заполнение пропусков модой\n",
|
|||
|
" ('onehot', OneHotEncoder(handle_unknown='ignore')) # Преобразование в One-Hot Encoding\n",
|
|||
|
"])\n",
|
|||
|
"\n",
|
|||
|
"# Комбинированный трансформер\n",
|
|||
|
"preprocessor = ColumnTransformer(\n",
|
|||
|
" transformers=[\n",
|
|||
|
" ('num', numeric_transformer, numeric_features), # Применяем числовую обработку\n",
|
|||
|
" ('cat', categorical_transformer, categorical_features) # Применяем категориальную обработку\n",
|
|||
|
" ]\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Разделение данных на обучающую и тестовую выборки\n",
|
|||
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение пайплайна\n",
|
|||
|
"X_train_transformed = preprocessor.fit_transform(X_train)\n",
|
|||
|
"X_test_transformed = preprocessor.transform(X_test)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка результата трансформации\n",
|
|||
|
"print(f\"Transformed feature shape (train): {X_train_transformed.shape}\")\n",
|
|||
|
"print(f\"Transformed feature shape (test): {X_test_transformed.shape}\")\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выведем результаты преобразования"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 28,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>work_year</th>\n",
|
|||
|
" <th>remote_ratio</th>\n",
|
|||
|
" <th>experience_level_EN</th>\n",
|
|||
|
" <th>experience_level_EX</th>\n",
|
|||
|
" <th>experience_level_MI</th>\n",
|
|||
|
" <th>experience_level_SE</th>\n",
|
|||
|
" <th>employment_type_CT</th>\n",
|
|||
|
" <th>employment_type_FL</th>\n",
|
|||
|
" <th>employment_type_FT</th>\n",
|
|||
|
" <th>employment_type_PT</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>company_location_SI</th>\n",
|
|||
|
" <th>company_location_SK</th>\n",
|
|||
|
" <th>company_location_TH</th>\n",
|
|||
|
" <th>company_location_TR</th>\n",
|
|||
|
" <th>company_location_UA</th>\n",
|
|||
|
" <th>company_location_US</th>\n",
|
|||
|
" <th>company_location_VN</th>\n",
|
|||
|
" <th>company_size_L</th>\n",
|
|||
|
" <th>company_size_M</th>\n",
|
|||
|
" <th>company_size_S</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>-1.747172</td>\n",
|
|||
|
" <td>1.016983</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>-1.747172</td>\n",
|
|||
|
" <td>1.016983</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>0.943539</td>\n",
|
|||
|
" <td>-1.057887</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>-0.401816</td>\n",
|
|||
|
" <td>1.016983</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>-0.401816</td>\n",
|
|||
|
" <td>-0.020452</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>5 rows × 151 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" work_year remote_ratio experience_level_EN experience_level_EX \\\n",
|
|||
|
"0 -1.747172 1.016983 0.0 0.0 \n",
|
|||
|
"1 -1.747172 1.016983 0.0 0.0 \n",
|
|||
|
"2 0.943539 -1.057887 0.0 0.0 \n",
|
|||
|
"3 -0.401816 1.016983 0.0 0.0 \n",
|
|||
|
"4 -0.401816 -0.020452 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" experience_level_MI experience_level_SE employment_type_CT \\\n",
|
|||
|
"0 0.0 1.0 0.0 \n",
|
|||
|
"1 0.0 1.0 0.0 \n",
|
|||
|
"2 0.0 1.0 0.0 \n",
|
|||
|
"3 1.0 0.0 0.0 \n",
|
|||
|
"4 1.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" employment_type_FL employment_type_FT employment_type_PT ... \\\n",
|
|||
|
"0 0.0 1.0 0.0 ... \n",
|
|||
|
"1 0.0 1.0 0.0 ... \n",
|
|||
|
"2 0.0 1.0 0.0 ... \n",
|
|||
|
"3 0.0 1.0 0.0 ... \n",
|
|||
|
"4 0.0 1.0 0.0 ... \n",
|
|||
|
"\n",
|
|||
|
" company_location_SI company_location_SK company_location_TH \\\n",
|
|||
|
"0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" company_location_TR company_location_UA company_location_US \\\n",
|
|||
|
"0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 1.0 \n",
|
|||
|
"2 0.0 0.0 1.0 \n",
|
|||
|
"3 0.0 0.0 1.0 \n",
|
|||
|
"4 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" company_location_VN company_size_L company_size_M company_size_S \n",
|
|||
|
"0 0.0 1.0 0.0 0.0 \n",
|
|||
|
"1 0.0 1.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 1.0 0.0 \n",
|
|||
|
"3 0.0 0.0 1.0 0.0 \n",
|
|||
|
"4 0.0 1.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[5 rows x 151 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 28,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Получим имена категориальных признаков после OneHotEncoder\n",
|
|||
|
"categorical_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Объединим их с именами числовых признаков\n",
|
|||
|
"feature_names = list(numeric_features) + list(categorical_feature_names)\n",
|
|||
|
"\n",
|
|||
|
"# Создадим DataFrame для преобразованных данных\n",
|
|||
|
"X_train_transformed_df = pd.DataFrame(X_train_transformed.toarray() if hasattr(X_train_transformed, 'toarray') else X_train_transformed, columns=feature_names)\n",
|
|||
|
"\n",
|
|||
|
"# Выведем первые 5 строк обработанного набора данных\n",
|
|||
|
"X_train_transformed_df.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Обучим три модели: LinearRegression, RandomForestRegressor и GradientBoostingRegressor."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 29,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Training LinearRegression...\n",
|
|||
|
"Training RandomForestRegressor..."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\salih\\OneDrive\\Рабочий стол\\3 курас\\МИИ\\laba1\\AIM-PIbd-31-Yaruskin-S-A\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 1 is smaller than n_iter=20. Running 1 iterations. For exhaustive searches, use GridSearchCV.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"Training GradientBoostingRegressor...\n",
|
|||
|
"\n",
|
|||
|
"Model: LinearRegression\n",
|
|||
|
"Best Params: {}\n",
|
|||
|
"MAE: 35903.74761235383\n",
|
|||
|
"RMSE: 45746.92374132039\n",
|
|||
|
"R2: 0.41681042958060477\n",
|
|||
|
"\n",
|
|||
|
"Model: RandomForestRegressor\n",
|
|||
|
"Best Params: {'model__n_estimators': 100, 'model__min_samples_split': 10, 'model__max_depth': 20}\n",
|
|||
|
"MAE: 35382.49447920311\n",
|
|||
|
"RMSE: 45711.49865435396\n",
|
|||
|
"R2: 0.41771328994747514\n",
|
|||
|
"\n",
|
|||
|
"Model: GradientBoostingRegressor\n",
|
|||
|
"Best Params: {'model__n_estimators': 50, 'model__max_depth': 5, 'model__learning_rate': 0.2}\n",
|
|||
|
"MAE: 35404.55042553757\n",
|
|||
|
"RMSE: 45669.354449671955\n",
|
|||
|
"R2: 0.41878648590699374\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import numpy as np\n",
|
|||
|
"from sklearn.linear_model import LinearRegression\n",
|
|||
|
"from sklearn.ensemble import RandomForestRegressor\n",
|
|||
|
"from sklearn.ensemble import GradientBoostingRegressor\n",
|
|||
|
"from sklearn.model_selection import RandomizedSearchCV\n",
|
|||
|
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
|||
|
"from sklearn.pipeline import Pipeline\n",
|
|||
|
"from sklearn.compose import ColumnTransformer\n",
|
|||
|
"\n",
|
|||
|
"# Задание случайного состояния\n",
|
|||
|
"random_state = 42\n",
|
|||
|
"\n",
|
|||
|
"# Модели и параметры\n",
|
|||
|
"models_regression = {\n",
|
|||
|
" \"LinearRegression\": LinearRegression(),\n",
|
|||
|
" \"RandomForestRegressor\": RandomForestRegressor(random_state=random_state),\n",
|
|||
|
" \"GradientBoostingRegressor\": GradientBoostingRegressor(random_state=random_state)\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"param_grids_regression = {\n",
|
|||
|
" \"LinearRegression\": {},\n",
|
|||
|
" \"RandomForestRegressor\": {\n",
|
|||
|
" 'model__n_estimators': [50, 100, 200],\n",
|
|||
|
" 'model__max_depth': [None, 10, 20],\n",
|
|||
|
" 'model__min_samples_split': [2, 5, 10]\n",
|
|||
|
" },\n",
|
|||
|
" \"GradientBoostingRegressor\": {\n",
|
|||
|
" 'model__n_estimators': [50, 100, 200],\n",
|
|||
|
" 'model__learning_rate': [0.01, 0.1, 0.2],\n",
|
|||
|
" 'model__max_depth': [3, 5, 10]\n",
|
|||
|
" }\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"# Результаты\n",
|
|||
|
"results_regression = {}\n",
|
|||
|
"\n",
|
|||
|
"# Перебор моделей\n",
|
|||
|
"for name, model in models_regression.items():\n",
|
|||
|
" print(f\"Training {name}...\")\n",
|
|||
|
" \n",
|
|||
|
" # Создание пайплайна\n",
|
|||
|
" pipeline = Pipeline(steps=[\n",
|
|||
|
" ('preprocessor', preprocessor), # Используем уже созданный preprocessor\n",
|
|||
|
" ('model', model)\n",
|
|||
|
" ])\n",
|
|||
|
" \n",
|
|||
|
" # Определение параметров для RandomizedSearchCV\n",
|
|||
|
" param_grid = param_grids_regression[name]\n",
|
|||
|
" search = RandomizedSearchCV(pipeline, param_distributions=param_grid, \n",
|
|||
|
" cv=5, scoring='neg_mean_absolute_error', \n",
|
|||
|
" n_jobs=-1, random_state=random_state, n_iter=20)\n",
|
|||
|
" search.fit(X_train, y_train)\n",
|
|||
|
" \n",
|
|||
|
" # Лучшая модель\n",
|
|||
|
" best_model = search.best_estimator_\n",
|
|||
|
" y_pred = best_model.predict(X_test)\n",
|
|||
|
" \n",
|
|||
|
" # Метрики\n",
|
|||
|
" mae = mean_absolute_error(y_test, y_pred)\n",
|
|||
|
" rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
|
|||
|
" r2 = r2_score(y_test, y_pred)\n",
|
|||
|
" \n",
|
|||
|
" # Сохранение результатов\n",
|
|||
|
" results_regression[name] = {\n",
|
|||
|
" \"Best Params\": search.best_params_,\n",
|
|||
|
" \"MAE\": mae,\n",
|
|||
|
" \"RMSE\": rmse,\n",
|
|||
|
" \"R2\": r2\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
"# Печать результатов\n",
|
|||
|
"for name, metrics in results_regression.items():\n",
|
|||
|
" print(f\"\\nModel: {name}\")\n",
|
|||
|
" for metric, value in metrics.items():\n",
|
|||
|
" print(f\"{metric}: {value}\")\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 30,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<style type=\"text/css\">\n",
|
|||
|
"#T_60a89_row0_col0 {\n",
|
|||
|
" background-color: #25858e;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_60a89_row0_col1, #T_60a89_row1_col0 {\n",
|
|||
|
" background-color: #26818e;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_60a89_row0_col2 {\n",
|
|||
|
" background-color: #da5a6a;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_60a89_row1_col1 {\n",
|
|||
|
" background-color: #37b878;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_60a89_row1_col2 {\n",
|
|||
|
" background-color: #9a169f;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_60a89_row2_col0, #T_60a89_row2_col1 {\n",
|
|||
|
" background-color: #a8db34;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_60a89_row2_col2 {\n",
|
|||
|
" background-color: #4e02a2;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"</style>\n",
|
|||
|
"<table id=\"T_60a89\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th class=\"blank level0\" > </th>\n",
|
|||
|
" <th id=\"T_60a89_level0_col0\" class=\"col_heading level0 col0\" >MAE</th>\n",
|
|||
|
" <th id=\"T_60a89_level0_col1\" class=\"col_heading level0 col1\" >RMSE</th>\n",
|
|||
|
" <th id=\"T_60a89_level0_col2\" class=\"col_heading level0 col2\" >R2</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_60a89_level0_row0\" class=\"row_heading level0 row0\" >GradientBoostingRegressor</th>\n",
|
|||
|
" <td id=\"T_60a89_row0_col0\" class=\"data row0 col0\" >35404.550426</td>\n",
|
|||
|
" <td id=\"T_60a89_row0_col1\" class=\"data row0 col1\" >45669.354450</td>\n",
|
|||
|
" <td id=\"T_60a89_row0_col2\" class=\"data row0 col2\" >0.418786</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_60a89_level0_row1\" class=\"row_heading level0 row1\" >RandomForestRegressor</th>\n",
|
|||
|
" <td id=\"T_60a89_row1_col0\" class=\"data row1 col0\" >35382.494479</td>\n",
|
|||
|
" <td id=\"T_60a89_row1_col1\" class=\"data row1 col1\" >45711.498654</td>\n",
|
|||
|
" <td id=\"T_60a89_row1_col2\" class=\"data row1 col2\" >0.417713</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_60a89_level0_row2\" class=\"row_heading level0 row2\" >LinearRegression</th>\n",
|
|||
|
" <td id=\"T_60a89_row2_col0\" class=\"data row2 col0\" >35903.747612</td>\n",
|
|||
|
" <td id=\"T_60a89_row2_col1\" class=\"data row2 col1\" >45746.923741</td>\n",
|
|||
|
" <td id=\"T_60a89_row2_col2\" class=\"data row2 col2\" >0.416810</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"<pandas.io.formats.style.Styler at 0x27c059b8500>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 30,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Формирование таблицы метрик из результатов регрессионных моделей\n",
|
|||
|
"reg_metrics = pd.DataFrame.from_dict(results_regression, orient=\"index\")[\n",
|
|||
|
" [\"MAE\", \"RMSE\", \"R2\"]\n",
|
|||
|
"]\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация результатов с помощью стилизации\n",
|
|||
|
"styled_metrics = (\n",
|
|||
|
" reg_metrics.sort_values(by=\"RMSE\")\n",
|
|||
|
" .style.background_gradient(cmap=\"viridis\", low=1, high=0.3, subset=[\"RMSE\", \"MAE\"])\n",
|
|||
|
" .background_gradient(cmap=\"plasma\", low=0.3, high=1, subset=[\"R2\"])\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Отобразим таблицу\n",
|
|||
|
"styled_metrics"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Что-то слабоватые модели получились: R² не достигает даже 0.5 (50%), нужно попробовать улучшить данные."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 38,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Transformed feature shape (train): (1886, 41)\n",
|
|||
|
"Transformed feature shape (test): (472, 41)\n",
|
|||
|
"Training LinearRegression...\n",
|
|||
|
"Training RandomForestRegressor...\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\salih\\OneDrive\\Рабочий стол\\3 курас\\МИИ\\laba1\\AIM-PIbd-31-Yaruskin-S-A\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 1 is smaller than n_iter=20. Running 1 iterations. For exhaustive searches, use GridSearchCV.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Training GradientBoostingRegressor...\n",
|
|||
|
"\n",
|
|||
|
"Model: LinearRegression\n",
|
|||
|
"Best Params: {}\n",
|
|||
|
"MAE: 37037.92339197355\n",
|
|||
|
"RMSE: 45540.3787622507\n",
|
|||
|
"R2: 0.4345993379829327\n",
|
|||
|
"\n",
|
|||
|
"Model: RandomForestRegressor\n",
|
|||
|
"Best Params: {'model__n_estimators': 300, 'model__min_samples_split': 10, 'model__max_depth': 10}\n",
|
|||
|
"MAE: 36990.476148447306\n",
|
|||
|
"RMSE: 45909.387319133624\n",
|
|||
|
"R2: 0.4253994599029146\n",
|
|||
|
"\n",
|
|||
|
"Model: GradientBoostingRegressor\n",
|
|||
|
"Best Params: {'model__n_estimators': 100, 'model__max_depth': 3, 'model__learning_rate': 0.1}\n",
|
|||
|
"MAE: 37119.53085174526\n",
|
|||
|
"RMSE: 45945.52202948378\n",
|
|||
|
"R2: 0.42449458199205714\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<style type=\"text/css\">\n",
|
|||
|
"#T_d0b41_row0_col0 {\n",
|
|||
|
" background-color: #22a785;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_d0b41_row0_col1, #T_d0b41_row1_col0 {\n",
|
|||
|
" background-color: #26818e;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_d0b41_row0_col2 {\n",
|
|||
|
" background-color: #da5a6a;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_d0b41_row1_col1 {\n",
|
|||
|
" background-color: #8ed645;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_d0b41_row1_col2 {\n",
|
|||
|
" background-color: #5e01a6;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"#T_d0b41_row2_col0, #T_d0b41_row2_col1 {\n",
|
|||
|
" background-color: #a8db34;\n",
|
|||
|
" color: #000000;\n",
|
|||
|
"}\n",
|
|||
|
"#T_d0b41_row2_col2 {\n",
|
|||
|
" background-color: #4e02a2;\n",
|
|||
|
" color: #f1f1f1;\n",
|
|||
|
"}\n",
|
|||
|
"</style>\n",
|
|||
|
"<table id=\"T_d0b41\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th class=\"blank level0\" > </th>\n",
|
|||
|
" <th id=\"T_d0b41_level0_col0\" class=\"col_heading level0 col0\" >MAE</th>\n",
|
|||
|
" <th id=\"T_d0b41_level0_col1\" class=\"col_heading level0 col1\" >RMSE</th>\n",
|
|||
|
" <th id=\"T_d0b41_level0_col2\" class=\"col_heading level0 col2\" >R2</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_d0b41_level0_row0\" class=\"row_heading level0 row0\" >LinearRegression</th>\n",
|
|||
|
" <td id=\"T_d0b41_row0_col0\" class=\"data row0 col0\" >37037.923392</td>\n",
|
|||
|
" <td id=\"T_d0b41_row0_col1\" class=\"data row0 col1\" >45540.378762</td>\n",
|
|||
|
" <td id=\"T_d0b41_row0_col2\" class=\"data row0 col2\" >0.434599</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_d0b41_level0_row1\" class=\"row_heading level0 row1\" >RandomForestRegressor</th>\n",
|
|||
|
" <td id=\"T_d0b41_row1_col0\" class=\"data row1 col0\" >36990.476148</td>\n",
|
|||
|
" <td id=\"T_d0b41_row1_col1\" class=\"data row1 col1\" >45909.387319</td>\n",
|
|||
|
" <td id=\"T_d0b41_row1_col2\" class=\"data row1 col2\" >0.425399</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th id=\"T_d0b41_level0_row2\" class=\"row_heading level0 row2\" >GradientBoostingRegressor</th>\n",
|
|||
|
" <td id=\"T_d0b41_row2_col0\" class=\"data row2 col0\" >37119.530852</td>\n",
|
|||
|
" <td id=\"T_d0b41_row2_col1\" class=\"data row2 col1\" >45945.522029</td>\n",
|
|||
|
" <td id=\"T_d0b41_row2_col2\" class=\"data row2 col2\" >0.424495</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"<pandas.io.formats.style.Styler at 0x27c00e56ae0>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 38,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Функция для приведения выбросов к среднему значению\n",
|
|||
|
"def handle_outliers(df, column):\n",
|
|||
|
" Q1 = df[column].quantile(0.25)\n",
|
|||
|
" Q3 = df[column].quantile(0.75)\n",
|
|||
|
" IQR = Q3 - Q1\n",
|
|||
|
" lower_bound = Q1 - 1.5 * IQR\n",
|
|||
|
" upper_bound = Q3 + 1.5 * IQR\n",
|
|||
|
" \n",
|
|||
|
" mean_value = df[column].mean()\n",
|
|||
|
" df[column] = np.where((df[column] < lower_bound) | (df[column] > upper_bound), mean_value, df[column])\n",
|
|||
|
" return df\n",
|
|||
|
"\n",
|
|||
|
"# Process outliers in the `salary_in_usd` column with the handle_outliers helper\n",
"df = handle_outliers(df, 'salary_in_usd')\n",
"\n",
"# Cast remote_ratio to str so that it is handled as a categorical feature\n",
"if 'remote_ratio' in df.columns:\n",
"    df['remote_ratio'] = df['remote_ratio'].astype(str)\n",
"\n",
"# Drop duplicate rows (reassigned instead of mutated in place)\n",
"df = df.drop_duplicates()\n",
"\n",
"# Features and regression target\n",
"X = df.drop(columns=['salary_in_usd', 'salary_currency', 'job_title'])  # Features\n",
"y = df['salary_in_usd']  # Regression target\n",
"\n",
"# Numeric and categorical feature columns\n",
"numeric_features = ['work_year']  # 'remote_ratio' is not listed here: it is treated as categorical\n",
"categorical_features = ['experience_level', 'employment_type',\n",
"                        'employee_residence', 'company_location', 'company_size', 'remote_ratio']\n",
"\n",
"# Numeric preprocessing\n",
"numeric_transformer = Pipeline(steps=[\n",
"    ('imputer', SimpleImputer(strategy='median')),  # Fill missing values with the median\n",
"    ('scaler', StandardScaler())  # Standardize the values\n",
"])\n",
"\n",
"# Categorical preprocessing\n",
"categorical_transformer = Pipeline(steps=[\n",
"    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values with the mode\n",
"    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encoding; unseen categories are ignored\n",
"])\n",
"\n",
"# Combined transformer\n",
"preprocessor = ColumnTransformer(\n",
"    transformers=[\n",
"        ('num', numeric_transformer, numeric_features),  # Numeric branch\n",
"        ('cat', categorical_transformer, categorical_features)  # Categorical branch\n",
"    ]\n",
")\n",
"\n",
"# Train/test split\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# Run the preprocessor on its own once, only to inspect the transformed shapes\n",
"X_train_transformed = preprocessor.fit_transform(X_train)\n",
"X_test_transformed = preprocessor.transform(X_test)\n",
"\n",
"print(f\"Transformed feature shape (train): {X_train_transformed.shape}\")\n",
"print(f\"Transformed feature shape (test): {X_test_transformed.shape}\")\n",
"\n",
"# Models and their hyperparameter grids\n",
"models = {\n",
"    \"LinearRegression\": LinearRegression(),\n",
"    \"RandomForestRegressor\": RandomForestRegressor(random_state=42),\n",
"    \"GradientBoostingRegressor\": GradientBoostingRegressor(random_state=42)\n",
"}\n",
"\n",
"param_grids = {\n",
"    \"LinearRegression\": {},\n",
"    \"RandomForestRegressor\": {\n",
"        'model__n_estimators': [100, 200, 300],\n",
"        'model__max_depth': [10, 20, None],\n",
"        'model__min_samples_split': [2, 5, 10]\n",
"    },\n",
"    \"GradientBoostingRegressor\": {\n",
"        'model__n_estimators': [100, 200, 300],\n",
"        'model__learning_rate': [0.01, 0.1, 0.2],\n",
"        'model__max_depth': [3, 5, 7]\n",
"    }\n",
"}\n",
"\n",
"# Collected metrics per model\n",
"results = {}\n",
"\n",
"# Train every model with a randomized hyperparameter search\n",
"for name, model in models.items():\n",
"    print(f\"Training {name}...\")\n",
"    pipeline = Pipeline(steps=[\n",
"        ('preprocessor', preprocessor),\n",
"        ('model', model)\n",
"    ])\n",
"    param_grid = param_grids[name]\n",
"    # Number of distinct combinations in the grid (1 for an empty grid).\n",
"    # n_iter is capped by it: requesting more iterations than combinations\n",
"    # (e.g. 20 for the empty LinearRegression grid) only triggers a warning.\n",
"    n_candidates = int(np.prod([len(values) for values in param_grid.values()]))\n",
"    search = RandomizedSearchCV(pipeline, param_distributions=param_grid, cv=5,\n",
"                                scoring='neg_mean_absolute_error', n_jobs=-1,\n",
"                                random_state=42, n_iter=min(20, n_candidates))\n",
"    search.fit(X_train, y_train)\n",
"\n",
"    # Best pipeline found by the search, evaluated on the held-out test set\n",
"    best_model = search.best_estimator_\n",
"    y_pred = best_model.predict(X_test)\n",
"\n",
"    # Metrics\n",
"    mae = mean_absolute_error(y_test, y_pred)\n",
"    rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
"    r2 = r2_score(y_test, y_pred)\n",
"\n",
"    # Store the results\n",
"    results[name] = {\n",
"        \"Best Params\": search.best_params_,\n",
"        \"MAE\": mae,\n",
"        \"RMSE\": rmse,\n",
"        \"R2\": r2\n",
"    }\n",
"\n",
"# Print the results\n",
"for name, metrics in results.items():\n",
"    print(f\"\\nModel: {name}\")\n",
"    for metric, value in metrics.items():\n",
"        print(f\"{metric}: {value}\")\n",
"\n",
"# Metrics table built from the regression results\n",
"reg_metrics = pd.DataFrame.from_dict(results, orient=\"index\")[\n",
"    [\"MAE\", \"RMSE\", \"R2\"]\n",
"]\n",
"\n",
"# Styled view of the metrics, sorted by RMSE\n",
"styled_metrics = (\n",
"    reg_metrics.sort_values(by=\"RMSE\")\n",
"    .style.background_gradient(cmap=\"viridis\", low=1, high=0.3, subset=[\"RMSE\", \"MAE\"])\n",
"    .background_gradient(cmap=\"plasma\", low=0.3, high=1, subset=[\"R2\"])\n",
")\n",
"\n",
"# Display the table\n",
"styled_metrics"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Немного переписал код: стало чуть лучше, но ненамного. Думаю, для моей первой работы подойдёт."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"<h2>Приступим к задаче классификации</h2>"
|
|||
|
]
|
|||
|
},
|
|||
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "import numpy as np\n",
  "import pandas as pd\n",
  "from sklearn.ensemble import RandomForestClassifier\n",
  "from sklearn.linear_model import LogisticRegression\n",
  "from sklearn.neighbors import KNeighborsClassifier\n",
  "from sklearn.model_selection import train_test_split, RandomizedSearchCV\n",
  "from sklearn.pipeline import Pipeline\n",
  "from sklearn.compose import ColumnTransformer\n",
  "from sklearn.impute import SimpleImputer\n",
  "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
  "from sklearn.metrics import accuracy_score, confusion_matrix, f1_score\n",
  "\n",
  "# Numeric and categorical feature columns of the salary dataset\n",
  "numeric_features = ['work_year']\n",
  "categorical_features = ['experience_level', 'employment_type', 'employee_residence',\n",
  "                        'company_location', 'company_size', 'remote_ratio']\n",
  "\n",
  "# Binary classification target: 1 if salary_in_usd is above the median, else 0.\n",
  "# The previous target column ('Leather interior') does not exist in this\n",
  "# DataFrame and raised KeyError.\n",
  "salary_threshold = df['salary_in_usd'].median()\n",
  "y_class = (df['salary_in_usd'] > salary_threshold).astype(int)\n",
  "\n",
  "# Select the feature columns explicitly so that no salary-derived column can leak\n",
  "# into the features; remote_ratio is cast to str because it is treated as categorical.\n",
  "X_clf = df[numeric_features + categorical_features].assign(\n",
  "    remote_ratio=lambda frame: frame['remote_ratio'].astype(str)\n",
  ")\n",
  "\n",
  "# Numeric preprocessing\n",
  "numeric_transformer = Pipeline(steps=[\n",
  "    ('imputer', SimpleImputer(strategy='median')),\n",
  "    ('scaler', StandardScaler())\n",
  "])\n",
  "\n",
  "# Categorical preprocessing\n",
  "categorical_transformer = Pipeline(steps=[\n",
  "    ('imputer', SimpleImputer(strategy='most_frequent')),\n",
  "    ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
  "])\n",
  "\n",
  "# Combined transformer\n",
  "preprocessor = ColumnTransformer(\n",
  "    transformers=[\n",
  "        ('num', numeric_transformer, numeric_features),\n",
  "        ('cat', categorical_transformer, categorical_features)\n",
  "    ]\n",
  ")\n",
  "\n",
  "# Train/test split, stratified so both parts keep the same class balance\n",
  "X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(\n",
  "    X_clf, y_class, test_size=0.2, random_state=42, stratify=y_class\n",
  ")\n",
  "\n",
  "# Models and their hyperparameter grids\n",
  "models_classification = {\n",
  "    \"LogisticRegression\": LogisticRegression(max_iter=1000),\n",
  "    \"RandomForestClassifier\": RandomForestClassifier(random_state=42),\n",
  "    \"KNN\": KNeighborsClassifier()\n",
  "}\n",
  "\n",
  "param_grids_classification = {\n",
  "    \"LogisticRegression\": {\n",
  "        'model__C': [0.1, 1, 10]\n",
  "    },\n",
  "    \"RandomForestClassifier\": {\n",
  "        \"model__n_estimators\": [100, 200, 300],\n",
  "        \"model__max_features\": [\"sqrt\", \"log2\", None],\n",
  "        \"model__max_depth\": [5, 10, 15, None],\n",
  "        \"model__criterion\": [\"gini\", \"entropy\"]\n",
  "    },\n",
  "    \"KNN\": {\n",
  "        'model__n_neighbors': [3, 5, 7, 9],\n",
  "        'model__weights': ['uniform', 'distance']\n",
  "    }\n",
  "}\n",
  "\n",
  "# Collected metrics per model\n",
  "results_classification = {}\n",
  "\n",
  "# Train every model with a randomized hyperparameter search\n",
  "for name, model in models_classification.items():\n",
  "    print(f\"Training {name}...\")\n",
  "    pipeline = Pipeline(steps=[\n",
  "        ('preprocessor', preprocessor),\n",
  "        ('model', model)\n",
  "    ])\n",
  "    param_grid = param_grids_classification[name]\n",
  "    # Cap n_iter (10 is the RandomizedSearchCV default) by the number of grid\n",
  "    # combinations; a larger n_iter than the grid only triggers a warning.\n",
  "    n_candidates = int(np.prod([len(values) for values in param_grid.values()]))\n",
  "    grid_search = RandomizedSearchCV(pipeline, param_distributions=param_grid,\n",
  "                                     n_iter=min(10, n_candidates), cv=5, scoring='f1',\n",
  "                                     n_jobs=-1, random_state=42)\n",
  "    grid_search.fit(X_train_clf, y_train_clf)\n",
  "\n",
  "    # Best pipeline found by the search, evaluated on the held-out test set\n",
  "    best_model = grid_search.best_estimator_\n",
  "    y_pred = best_model.predict(X_test_clf)\n",
  "\n",
  "    # Metrics\n",
  "    acc = accuracy_score(y_test_clf, y_pred)\n",
  "    f1 = f1_score(y_test_clf, y_pred)\n",
  "\n",
  "    # Confusion matrix\n",
  "    c_matrix = confusion_matrix(y_test_clf, y_pred)\n",
  "\n",
  "    # Store the results\n",
  "    results_classification[name] = {\n",
  "        \"Best Params\": grid_search.best_params_,\n",
  "        \"Accuracy\": acc,\n",
  "        \"F1 Score\": f1,\n",
  "        \"Confusion_matrix\": c_matrix\n",
  "    }\n",
  "\n",
  "# Print the results\n",
  "for name, metrics in results_classification.items():\n",
  "    print(f\"\\nModel: {name}\")\n",
  "    for metric, value in metrics.items():\n",
  "        print(f\"{metric}: {value}\")\n"
 ]
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aimenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|