1315 lines
120 KiB
Plaintext
1315 lines
120 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 52,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>work_year</th>\n",
|
||
" <th>experience_level</th>\n",
|
||
" <th>employment_type</th>\n",
|
||
" <th>job_title</th>\n",
|
||
" <th>salary</th>\n",
|
||
" <th>salary_currency</th>\n",
|
||
" <th>salary_in_usd</th>\n",
|
||
" <th>employee_residence</th>\n",
|
||
" <th>remote_ratio</th>\n",
|
||
" <th>company_location</th>\n",
|
||
" <th>company_size</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>2023</td>\n",
|
||
" <td>SE</td>\n",
|
||
" <td>FT</td>\n",
|
||
" <td>Principal Data Scientist</td>\n",
|
||
" <td>80000</td>\n",
|
||
" <td>EUR</td>\n",
|
||
" <td>85847</td>\n",
|
||
" <td>ES</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>ES</td>\n",
|
||
" <td>L</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2023</td>\n",
|
||
" <td>MI</td>\n",
|
||
" <td>CT</td>\n",
|
||
" <td>ML Engineer</td>\n",
|
||
" <td>30000</td>\n",
|
||
" <td>USD</td>\n",
|
||
" <td>30000</td>\n",
|
||
" <td>US</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>US</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2023</td>\n",
|
||
" <td>MI</td>\n",
|
||
" <td>CT</td>\n",
|
||
" <td>ML Engineer</td>\n",
|
||
" <td>25500</td>\n",
|
||
" <td>USD</td>\n",
|
||
" <td>25500</td>\n",
|
||
" <td>US</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>US</td>\n",
|
||
" <td>S</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>2023</td>\n",
|
||
" <td>SE</td>\n",
|
||
" <td>FT</td>\n",
|
||
" <td>Data Scientist</td>\n",
|
||
" <td>175000</td>\n",
|
||
" <td>USD</td>\n",
|
||
" <td>175000</td>\n",
|
||
" <td>CA</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>CA</td>\n",
|
||
" <td>M</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2023</td>\n",
|
||
" <td>SE</td>\n",
|
||
" <td>FT</td>\n",
|
||
" <td>Data Scientist</td>\n",
|
||
" <td>120000</td>\n",
|
||
" <td>USD</td>\n",
|
||
" <td>120000</td>\n",
|
||
" <td>CA</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>CA</td>\n",
|
||
" <td>M</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" work_year experience_level employment_type job_title \\\n",
|
||
"0 2023 SE FT Principal Data Scientist \n",
|
||
"1 2023 MI CT ML Engineer \n",
|
||
"2 2023 MI CT ML Engineer \n",
|
||
"3 2023 SE FT Data Scientist \n",
|
||
"4 2023 SE FT Data Scientist \n",
|
||
"\n",
|
||
" salary salary_currency salary_in_usd employee_residence remote_ratio \\\n",
|
||
"0 80000 EUR 85847 ES 100 \n",
|
||
"1 30000 USD 30000 US 100 \n",
|
||
"2 25500 USD 25500 US 100 \n",
|
||
"3 175000 USD 175000 CA 100 \n",
|
||
"4 120000 USD 120000 CA 100 \n",
|
||
"\n",
|
||
" company_location company_size \n",
|
||
"0 ES L \n",
|
||
"1 US S \n",
|
||
"2 US S \n",
|
||
"3 CA M \n",
|
||
"4 CA M "
|
||
]
|
||
},
|
||
"execution_count": 52,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
|
||
"\n",
|
||
"df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"<h2>Бизнес-цели</h2>\n",
|
||
"Задача регрессии: построить модель для прогноза зарплаты в USD (salary_in_usd) по остальным атрибутам набора данных.<br>\n",
|
||
"Задача классификации: определить уровень опыта сотрудника (experience_level) на основе других характеристик, таких как job_title, salary_in_usd и employment_type."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Проведём предобработку данных и разделим их на обучающую и тестовую выборки."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 53,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Transformed feature shape (train): (2029, 151)\n",
|
||
"Transformed feature shape (test): (508, 151)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.impute import SimpleImputer\n",
|
||
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
|
||
"\n",
|
||
"# Удаление выбросов из столбца `salary_in_usd` с использованием IQR\n",
|
||
"Q1 = df['salary_in_usd'].quantile(0.25)\n",
|
||
"Q3 = df['salary_in_usd'].quantile(0.75)\n",
|
||
"IQR = Q3 - Q1\n",
|
||
"df = df[(df['salary_in_usd'] >= Q1 - 1.5 * IQR) & (df['salary_in_usd'] <= Q3 + 1.5 * IQR)]\n",
|
||
"\n",
|
||
"# Преобразование категориальных данных в числовые (если потребуется)\n",
|
||
"if 'remote_ratio' in df.columns:\n",
|
||
" df['remote_ratio'] = df['remote_ratio'].astype(int)\n",
|
||
"\n",
|
||
"# Удаление дубликатов\n",
|
||
"df.drop_duplicates(inplace=True)\n",
|
||
"\n",
|
||
"# Определение целевой переменной и признаков\n",
|
||
"X = df.drop(columns=['salary_in_usd', 'salary_currency', 'job_title']) # Признаки\n",
|
||
"y = df['salary_in_usd'] # Целевая переменная для регрессии\n",
|
||
"\n",
|
||
"# Определение числовых и категориальных признаков\n",
|
||
"numeric_features = ['work_year', 'remote_ratio']\n",
|
||
"categorical_features = ['experience_level', 'employment_type', \n",
|
||
" 'employee_residence', 'company_location', 'company_size']\n",
|
||
"\n",
|
||
"# Обработка числовых данных\n",
|
||
"numeric_transformer = Pipeline(steps=[\n",
|
||
" ('imputer', SimpleImputer(strategy='median')), # Заполнение пропусков медианой\n",
|
||
" ('scaler', StandardScaler()) # Нормализация данных\n",
|
||
"])\n",
|
||
"\n",
|
||
"# Обработка категориальных данных\n",
|
||
"categorical_transformer = Pipeline(steps=[\n",
|
||
" ('imputer', SimpleImputer(strategy='most_frequent')), # Заполнение пропусков модой\n",
|
||
" ('onehot', OneHotEncoder(handle_unknown='ignore')) # Преобразование в One-Hot Encoding\n",
|
||
"])\n",
|
||
"\n",
|
||
"# Комбинированный трансформер\n",
|
||
"preprocessor = ColumnTransformer(\n",
|
||
" transformers=[\n",
|
||
" ('num', numeric_transformer, numeric_features), # Применяем числовую обработку\n",
|
||
" ('cat', categorical_transformer, categorical_features) # Применяем категориальную обработку\n",
|
||
" ]\n",
|
||
")\n",
|
||
"\n",
|
||
"# Разделение данных на обучающую и тестовую выборки\n",
|
||
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Применение пайплайна\n",
|
||
"X_train_transformed = preprocessor.fit_transform(X_train)\n",
|
||
"X_test_transformed = preprocessor.transform(X_test)\n",
|
||
"\n",
|
||
"# Проверка результата трансформации\n",
|
||
"print(f\"Transformed feature shape (train): {X_train_transformed.shape}\")\n",
|
||
"print(f\"Transformed feature shape (test): {X_test_transformed.shape}\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Выведем результаты"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 54,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>work_year</th>\n",
|
||
" <th>remote_ratio</th>\n",
|
||
" <th>experience_level_EN</th>\n",
|
||
" <th>experience_level_EX</th>\n",
|
||
" <th>experience_level_MI</th>\n",
|
||
" <th>experience_level_SE</th>\n",
|
||
" <th>employment_type_CT</th>\n",
|
||
" <th>employment_type_FL</th>\n",
|
||
" <th>employment_type_FT</th>\n",
|
||
" <th>employment_type_PT</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>company_location_SI</th>\n",
|
||
" <th>company_location_SK</th>\n",
|
||
" <th>company_location_TH</th>\n",
|
||
" <th>company_location_TR</th>\n",
|
||
" <th>company_location_UA</th>\n",
|
||
" <th>company_location_US</th>\n",
|
||
" <th>company_location_VN</th>\n",
|
||
" <th>company_size_L</th>\n",
|
||
" <th>company_size_M</th>\n",
|
||
" <th>company_size_S</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>-1.747172</td>\n",
|
||
" <td>1.016983</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>-1.747172</td>\n",
|
||
" <td>1.016983</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>0.943539</td>\n",
|
||
" <td>-1.057887</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>-0.401816</td>\n",
|
||
" <td>1.016983</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>-0.401816</td>\n",
|
||
" <td>-0.020452</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 151 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" work_year remote_ratio experience_level_EN experience_level_EX \\\n",
|
||
"0 -1.747172 1.016983 0.0 0.0 \n",
|
||
"1 -1.747172 1.016983 0.0 0.0 \n",
|
||
"2 0.943539 -1.057887 0.0 0.0 \n",
|
||
"3 -0.401816 1.016983 0.0 0.0 \n",
|
||
"4 -0.401816 -0.020452 0.0 0.0 \n",
|
||
"\n",
|
||
" experience_level_MI experience_level_SE employment_type_CT \\\n",
|
||
"0 0.0 1.0 0.0 \n",
|
||
"1 0.0 1.0 0.0 \n",
|
||
"2 0.0 1.0 0.0 \n",
|
||
"3 1.0 0.0 0.0 \n",
|
||
"4 1.0 0.0 0.0 \n",
|
||
"\n",
|
||
" employment_type_FL employment_type_FT employment_type_PT ... \\\n",
|
||
"0 0.0 1.0 0.0 ... \n",
|
||
"1 0.0 1.0 0.0 ... \n",
|
||
"2 0.0 1.0 0.0 ... \n",
|
||
"3 0.0 1.0 0.0 ... \n",
|
||
"4 0.0 1.0 0.0 ... \n",
|
||
"\n",
|
||
" company_location_SI company_location_SK company_location_TH \\\n",
|
||
"0 0.0 0.0 0.0 \n",
|
||
"1 0.0 0.0 0.0 \n",
|
||
"2 0.0 0.0 0.0 \n",
|
||
"3 0.0 0.0 0.0 \n",
|
||
"4 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
" company_location_TR company_location_UA company_location_US \\\n",
|
||
"0 0.0 0.0 0.0 \n",
|
||
"1 0.0 0.0 1.0 \n",
|
||
"2 0.0 0.0 1.0 \n",
|
||
"3 0.0 0.0 1.0 \n",
|
||
"4 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
" company_location_VN company_size_L company_size_M company_size_S \n",
|
||
"0 0.0 1.0 0.0 0.0 \n",
|
||
"1 0.0 1.0 0.0 0.0 \n",
|
||
"2 0.0 0.0 1.0 0.0 \n",
|
||
"3 0.0 0.0 1.0 0.0 \n",
|
||
"4 0.0 1.0 0.0 0.0 \n",
|
||
"\n",
|
||
"[5 rows x 151 columns]"
|
||
]
|
||
},
|
||
"execution_count": 54,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Получим имена категориальных признаков после OneHotEncoder\n",
|
||
"categorical_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)\n",
|
||
"\n",
|
||
"# Объединим их с именами числовых признаков\n",
|
||
"feature_names = list(numeric_features) + list(categorical_feature_names)\n",
|
||
"\n",
|
||
"# Создадим DataFrame для преобразованных данных\n",
|
||
"X_train_transformed_df = pd.DataFrame(X_train_transformed.toarray() if hasattr(X_train_transformed, 'toarray') else X_train_transformed, columns=feature_names)\n",
|
||
"\n",
|
||
"# Выведем первые 5 строк обработанного набора данных\n",
|
||
"X_train_transformed_df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Обучим три регрессионные модели: LinearRegression, RandomForestRegressor и GradientBoostingRegressor."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 55,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training LinearRegression...\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\salih\\OneDrive\\Рабочий стол\\3 курас\\МИИ\\laba1\\AIM-PIbd-31-Yaruskin-S-A\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 1 is smaller than n_iter=20. Running 1 iterations. For exhaustive searches, use GridSearchCV.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training RandomForestRegressor...\n",
|
||
"Training GradientBoostingRegressor...\n",
|
||
"\n",
|
||
"Model: LinearRegression\n",
|
||
"Best Params: {}\n",
|
||
"MAE: 35903.74761235383\n",
|
||
"RMSE: 45746.92374132039\n",
|
||
"R2: 0.41681042958060477\n",
|
||
"\n",
|
||
"Model: RandomForestRegressor\n",
|
||
"Best Params: {'model__n_estimators': 100, 'model__min_samples_split': 10, 'model__max_depth': 20}\n",
|
||
"MAE: 35382.49447920311\n",
|
||
"RMSE: 45711.49865435396\n",
|
||
"R2: 0.41771328994747514\n",
|
||
"\n",
|
||
"Model: GradientBoostingRegressor\n",
|
||
"Best Params: {'model__n_estimators': 50, 'model__max_depth': 5, 'model__learning_rate': 0.2}\n",
|
||
"MAE: 35404.55042553757\n",
|
||
"RMSE: 45669.354449671955\n",
|
||
"R2: 0.41878648590699374\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"from sklearn.linear_model import LinearRegression\n",
|
||
"from sklearn.ensemble import RandomForestRegressor\n",
|
||
"from sklearn.ensemble import GradientBoostingRegressor\n",
|
||
"from sklearn.model_selection import RandomizedSearchCV\n",
|
||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"\n",
|
||
"# Задание случайного состояния\n",
|
||
"random_state = 42\n",
|
||
"\n",
|
||
"# Модели и параметры\n",
|
||
"models_regression = {\n",
|
||
" \"LinearRegression\": LinearRegression(),\n",
|
||
" \"RandomForestRegressor\": RandomForestRegressor(random_state=random_state),\n",
|
||
" \"GradientBoostingRegressor\": GradientBoostingRegressor(random_state=random_state)\n",
|
||
"}\n",
|
||
"\n",
|
||
"param_grids_regression = {\n",
|
||
" \"LinearRegression\": {},\n",
|
||
" \"RandomForestRegressor\": {\n",
|
||
" 'model__n_estimators': [50, 100, 200],\n",
|
||
" 'model__max_depth': [None, 10, 20],\n",
|
||
" 'model__min_samples_split': [2, 5, 10]\n",
|
||
" },\n",
|
||
" \"GradientBoostingRegressor\": {\n",
|
||
" 'model__n_estimators': [50, 100, 200],\n",
|
||
" 'model__learning_rate': [0.01, 0.1, 0.2],\n",
|
||
" 'model__max_depth': [3, 5, 10]\n",
|
||
" }\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Результаты\n",
|
||
"results_regression = {}\n",
|
||
"\n",
|
||
"# Перебор моделей\n",
|
||
"for name, model in models_regression.items():\n",
|
||
" print(f\"Training {name}...\")\n",
|
||
" \n",
|
||
" # Создание пайплайна\n",
|
||
" pipeline = Pipeline(steps=[\n",
|
||
" ('preprocessor', preprocessor), # Используем уже созданный preprocessor\n",
|
||
" ('model', model)\n",
|
||
" ])\n",
|
||
" \n",
|
||
" # Определение параметров для RandomizedSearchCV\n",
|
||
" param_grid = param_grids_regression[name]\n",
|
||
" search = RandomizedSearchCV(pipeline, param_distributions=param_grid, \n",
|
||
" cv=5, scoring='neg_mean_absolute_error', \n",
|
||
" n_jobs=-1, random_state=random_state, n_iter=20)\n",
|
||
" search.fit(X_train, y_train)\n",
|
||
" \n",
|
||
" # Лучшая модель\n",
|
||
" best_model = search.best_estimator_\n",
|
||
" y_pred = best_model.predict(X_test)\n",
|
||
" \n",
|
||
" # Метрики\n",
|
||
" mae = mean_absolute_error(y_test, y_pred)\n",
|
||
" rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
|
||
" r2 = r2_score(y_test, y_pred)\n",
|
||
" \n",
|
||
" # Сохранение результатов\n",
|
||
" results_regression[name] = {\n",
|
||
" \"Best Params\": search.best_params_,\n",
|
||
" \"MAE\": mae,\n",
|
||
" \"RMSE\": rmse,\n",
|
||
" \"R2\": r2\n",
|
||
" }\n",
|
||
"\n",
|
||
"# Печать результатов\n",
|
||
"for name, metrics in results_regression.items():\n",
|
||
" print(f\"\\nModel: {name}\")\n",
|
||
" for metric, value in metrics.items():\n",
|
||
" print(f\"{metric}: {value}\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 56,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<style type=\"text/css\">\n",
|
||
"#T_05579_row0_col0 {\n",
|
||
" background-color: #25858e;\n",
|
||
" color: #f1f1f1;\n",
|
||
"}\n",
|
||
"#T_05579_row0_col1, #T_05579_row1_col0 {\n",
|
||
" background-color: #26818e;\n",
|
||
" color: #f1f1f1;\n",
|
||
"}\n",
|
||
"#T_05579_row0_col2 {\n",
|
||
" background-color: #da5a6a;\n",
|
||
" color: #f1f1f1;\n",
|
||
"}\n",
|
||
"#T_05579_row1_col1 {\n",
|
||
" background-color: #37b878;\n",
|
||
" color: #f1f1f1;\n",
|
||
"}\n",
|
||
"#T_05579_row1_col2 {\n",
|
||
" background-color: #9a169f;\n",
|
||
" color: #f1f1f1;\n",
|
||
"}\n",
|
||
"#T_05579_row2_col0, #T_05579_row2_col1 {\n",
|
||
" background-color: #a8db34;\n",
|
||
" color: #000000;\n",
|
||
"}\n",
|
||
"#T_05579_row2_col2 {\n",
|
||
" background-color: #4e02a2;\n",
|
||
" color: #f1f1f1;\n",
|
||
"}\n",
|
||
"</style>\n",
|
||
"<table id=\"T_05579\">\n",
|
||
" <thead>\n",
|
||
" <tr>\n",
|
||
" <th class=\"blank level0\" > </th>\n",
|
||
" <th id=\"T_05579_level0_col0\" class=\"col_heading level0 col0\" >MAE</th>\n",
|
||
" <th id=\"T_05579_level0_col1\" class=\"col_heading level0 col1\" >RMSE</th>\n",
|
||
" <th id=\"T_05579_level0_col2\" class=\"col_heading level0 col2\" >R2</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th id=\"T_05579_level0_row0\" class=\"row_heading level0 row0\" >GradientBoostingRegressor</th>\n",
|
||
" <td id=\"T_05579_row0_col0\" class=\"data row0 col0\" >35404.550426</td>\n",
|
||
" <td id=\"T_05579_row0_col1\" class=\"data row0 col1\" >45669.354450</td>\n",
|
||
" <td id=\"T_05579_row0_col2\" class=\"data row0 col2\" >0.418786</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th id=\"T_05579_level0_row1\" class=\"row_heading level0 row1\" >RandomForestRegressor</th>\n",
|
||
" <td id=\"T_05579_row1_col0\" class=\"data row1 col0\" >35382.494479</td>\n",
|
||
" <td id=\"T_05579_row1_col1\" class=\"data row1 col1\" >45711.498654</td>\n",
|
||
" <td id=\"T_05579_row1_col2\" class=\"data row1 col2\" >0.417713</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th id=\"T_05579_level0_row2\" class=\"row_heading level0 row2\" >LinearRegression</th>\n",
|
||
" <td id=\"T_05579_row2_col0\" class=\"data row2 col0\" >35903.747612</td>\n",
|
||
" <td id=\"T_05579_row2_col1\" class=\"data row2 col1\" >45746.923741</td>\n",
|
||
" <td id=\"T_05579_row2_col2\" class=\"data row2 col2\" >0.416810</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n"
|
||
],
|
||
"text/plain": [
|
||
"<pandas.io.formats.style.Styler at 0x27c0be5fc50>"
|
||
]
|
||
},
|
||
"execution_count": 56,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Формирование таблицы метрик из результатов регрессионных моделей\n",
|
||
"reg_metrics = pd.DataFrame.from_dict(results_regression, orient=\"index\")[\n",
|
||
" [\"MAE\", \"RMSE\", \"R2\"]\n",
|
||
"]\n",
|
||
"\n",
|
||
"# Визуализация результатов с помощью стилизации\n",
|
||
"styled_metrics = (\n",
|
||
" reg_metrics.sort_values(by=\"RMSE\")\n",
|
||
" .style.background_gradient(cmap=\"viridis\", low=1, high=0.3, subset=[\"RMSE\", \"MAE\"])\n",
|
||
" .background_gradient(cmap=\"plasma\", low=0.3, high=1, subset=[\"R2\"])\n",
|
||
")\n",
|
||
"\n",
|
||
"# Отобразим таблицу\n",
|
||
"styled_metrics"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Что-то слабоватые модели получились: R² не дотягивает даже до 0.5 (лучший результат — около 0.42). Нужно попробовать улучшить подготовку данных."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 57,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Transformed feature shape (train): (2029, 153)\n",
|
||
"Transformed feature shape (test): (508, 153)\n",
|
||
"Training LinearRegression...\n",
|
||
"Training RandomForestRegressor...\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\salih\\OneDrive\\Рабочий стол\\3 курас\\МИИ\\laba1\\AIM-PIbd-31-Yaruskin-S-A\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 1 is smaller than n_iter=20. Running 1 iterations. For exhaustive searches, use GridSearchCV.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training GradientBoostingRegressor...\n",
|
||
"\n",
|
||
"Model: LinearRegression\n",
|
||
"Best Params: {}\n",
|
||
"MAE: 35923.393167146765\n",
|
||
"RMSE: 45787.2465103007\n",
|
||
"R2: 0.41578189344376837\n",
|
||
"\n",
|
||
"Model: RandomForestRegressor\n",
|
||
"Best Params: {'model__n_estimators': 100, 'model__min_samples_split': 10, 'model__max_depth': 20}\n",
|
||
"MAE: 35428.00841155441\n",
|
||
"RMSE: 45772.13311276274\n",
|
||
"R2: 0.41616750576805106\n",
|
||
"\n",
|
||
"Model: GradientBoostingRegressor\n",
|
||
"Best Params: {'model__n_estimators': 100, 'model__max_depth': 3, 'model__learning_rate': 0.1}\n",
|
||
"MAE: 35575.964157916314\n",
|
||
"RMSE: 45645.593157690266\n",
|
||
"R2: 0.41939112731165484\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<style type=\"text/css\">\n",
|
||
"#T_baae7_row0_col0 {\n",
|
||
" background-color: #1fa088;\n",
|
||
" color: #f1f1f1;\n",
|
||
"}\n",
|
||
"#T_baae7_row0_col1, #T_baae7_row1_col0 {\n",
|
||
" background-color: #26818e;\n",
|
||
" color: #f1f1f1;\n",
|
||
"}\n",
|
||
"#T_baae7_row0_col2 {\n",
|
||
" background-color: #da5a6a;\n",
|
||
" color: #f1f1f1;\n",
|
||
"}\n",
|
||
"#T_baae7_row1_col1 {\n",
|
||
" background-color: #89d548;\n",
|
||
" color: #000000;\n",
|
||
"}\n",
|
||
"#T_baae7_row1_col2 {\n",
|
||
" background-color: #6100a7;\n",
|
||
" color: #f1f1f1;\n",
|
||
"}\n",
|
||
"#T_baae7_row2_col0, #T_baae7_row2_col1 {\n",
|
||
" background-color: #a8db34;\n",
|
||
" color: #000000;\n",
|
||
"}\n",
|
||
"#T_baae7_row2_col2 {\n",
|
||
" background-color: #4e02a2;\n",
|
||
" color: #f1f1f1;\n",
|
||
"}\n",
|
||
"</style>\n",
|
||
"<table id=\"T_baae7\">\n",
|
||
" <thead>\n",
|
||
" <tr>\n",
|
||
" <th class=\"blank level0\" > </th>\n",
|
||
" <th id=\"T_baae7_level0_col0\" class=\"col_heading level0 col0\" >MAE</th>\n",
|
||
" <th id=\"T_baae7_level0_col1\" class=\"col_heading level0 col1\" >RMSE</th>\n",
|
||
" <th id=\"T_baae7_level0_col2\" class=\"col_heading level0 col2\" >R2</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th id=\"T_baae7_level0_row0\" class=\"row_heading level0 row0\" >GradientBoostingRegressor</th>\n",
|
||
" <td id=\"T_baae7_row0_col0\" class=\"data row0 col0\" >35575.964158</td>\n",
|
||
" <td id=\"T_baae7_row0_col1\" class=\"data row0 col1\" >45645.593158</td>\n",
|
||
" <td id=\"T_baae7_row0_col2\" class=\"data row0 col2\" >0.419391</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th id=\"T_baae7_level0_row1\" class=\"row_heading level0 row1\" >RandomForestRegressor</th>\n",
|
||
" <td id=\"T_baae7_row1_col0\" class=\"data row1 col0\" >35428.008412</td>\n",
|
||
" <td id=\"T_baae7_row1_col1\" class=\"data row1 col1\" >45772.133113</td>\n",
|
||
" <td id=\"T_baae7_row1_col2\" class=\"data row1 col2\" >0.416168</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th id=\"T_baae7_level0_row2\" class=\"row_heading level0 row2\" >LinearRegression</th>\n",
|
||
" <td id=\"T_baae7_row2_col0\" class=\"data row2 col0\" >35923.393167</td>\n",
|
||
" <td id=\"T_baae7_row2_col1\" class=\"data row2 col1\" >45787.246510</td>\n",
|
||
" <td id=\"T_baae7_row2_col2\" class=\"data row2 col2\" >0.415782</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n"
|
||
],
|
||
"text/plain": [
|
||
"<pandas.io.formats.style.Styler at 0x27c0bec9850>"
|
||
]
|
||
},
|
||
"execution_count": 57,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Функция для приведения выбросов к среднему значению\n",
|
||
"def handle_outliers(df, column):\n",
|
||
" Q1 = df[column].quantile(0.25)\n",
|
||
" Q3 = df[column].quantile(0.75)\n",
|
||
" IQR = Q3 - Q1\n",
|
||
" lower_bound = Q1 - 1.5 * IQR\n",
|
||
" upper_bound = Q3 + 1.5 * IQR\n",
|
||
" \n",
|
||
" mean_value = df[column].mean()\n",
|
||
" df[column] = np.where((df[column] < lower_bound) | (df[column] > upper_bound), mean_value, df[column])\n",
|
||
" return df\n",
|
||
"\n",
|
||
"# Приведение выбросов в столбце `salary_in_usd` к среднему значению\n",
|
||
"df = handle_outliers(df, 'salary_in_usd')\n",
|
||
"\n",
|
||
"# Преобразование категориальных данных в строковые для корректной обработки\n",
|
||
"if 'remote_ratio' in df.columns:\n",
|
||
" df['remote_ratio'] = df['remote_ratio'].astype(str)\n",
|
||
"\n",
|
||
"# Удаление дубликатов\n",
|
||
"df.drop_duplicates(inplace=True)\n",
|
||
"\n",
|
||
"# Определение целевой переменной и признаков\n",
|
||
"X = df.drop(columns=['salary_in_usd', 'salary_currency', 'job_title']) # Признаки\n",
|
||
"y = df['salary_in_usd'] # Целевая переменная для регрессии\n",
|
||
"\n",
|
||
"# Определение числовых и категориальных признаков\n",
|
||
"numeric_features = ['work_year'] # Убрали 'remote_ratio', так как это категориальный признак\n",
|
||
"categorical_features = ['experience_level', 'employment_type', \n",
|
||
" 'employee_residence', 'company_location', 'company_size', 'remote_ratio']\n",
|
||
"\n",
|
||
"# Обработка числовых данных\n",
|
||
"numeric_transformer = Pipeline(steps=[\n",
|
||
" ('imputer', SimpleImputer(strategy='median')), # Заполнение пропусков медианой\n",
|
||
" ('scaler', StandardScaler()) # Нормализация данных\n",
|
||
"])\n",
|
||
"\n",
|
||
"# Обработка категориальных данных\n",
|
||
"categorical_transformer = Pipeline(steps=[\n",
|
||
" ('imputer', SimpleImputer(strategy='most_frequent')), # Заполнение пропусков модой\n",
|
||
" ('onehot', OneHotEncoder(handle_unknown='ignore')) # Преобразование в One-Hot Encoding\n",
|
||
"])\n",
|
||
"\n",
|
||
"# Комбинированный трансформер\n",
|
||
"preprocessor = ColumnTransformer(\n",
|
||
" transformers=[\n",
|
||
" ('num', numeric_transformer, numeric_features), # Применяем числовую обработку\n",
|
||
" ('cat', categorical_transformer, categorical_features) # Применяем категориальную обработку\n",
|
||
" ]\n",
|
||
")\n",
|
||
"\n",
|
||
"# Разделение данных на обучающую и тестовую выборки\n",
|
||
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Применение пайплайна\n",
|
||
"X_train_transformed = preprocessor.fit_transform(X_train)\n",
|
||
"X_test_transformed = preprocessor.transform(X_test)\n",
|
||
"\n",
|
||
"# Проверка результата трансформации\n",
|
||
"print(f\"Transformed feature shape (train): {X_train_transformed.shape}\")\n",
|
||
"print(f\"Transformed feature shape (test): {X_test_transformed.shape}\")\n",
|
||
"\n",
|
||
"# Определение моделей и их параметров\n",
|
||
"models = {\n",
|
||
" \"LinearRegression\": LinearRegression(),\n",
|
||
" \"RandomForestRegressor\": RandomForestRegressor(random_state=42),\n",
|
||
" \"GradientBoostingRegressor\": GradientBoostingRegressor(random_state=42)\n",
|
||
"}\n",
|
||
"\n",
|
||
"param_grids = {\n",
|
||
" \"LinearRegression\": {},\n",
|
||
" \"RandomForestRegressor\": {\n",
|
||
" 'model__n_estimators': [100, 200, 300],\n",
|
||
" 'model__max_depth': [10, 20, None],\n",
|
||
" 'model__min_samples_split': [2, 5, 10]\n",
|
||
" },\n",
|
||
" \"GradientBoostingRegressor\": {\n",
|
||
" 'model__n_estimators': [100, 200, 300],\n",
|
||
" 'model__learning_rate': [0.01, 0.1, 0.2],\n",
|
||
" 'model__max_depth': [3, 5, 7]\n",
|
||
" }\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Результаты\n",
|
||
"results = {}\n",
|
||
"\n",
|
||
"# Обучение моделей с подбором гиперпараметров\n",
|
||
"for name, model in models.items():\n",
|
||
" print(f\"Training {name}...\")\n",
|
||
" pipeline = Pipeline(steps=[\n",
|
||
" ('preprocessor', preprocessor),\n",
|
||
" ('model', model)\n",
|
||
" ])\n",
|
||
" param_grid = param_grids[name]\n",
|
||
" search = RandomizedSearchCV(pipeline, param_distributions=param_grid, cv=5, \n",
|
||
" scoring='neg_mean_absolute_error', n_jobs=-1, \n",
|
||
" random_state=42, n_iter=20)\n",
|
||
" search.fit(X_train, y_train)\n",
|
||
" \n",
|
||
" # Лучшая модель\n",
|
||
" best_model = search.best_estimator_\n",
|
||
" y_pred = best_model.predict(X_test)\n",
|
||
" \n",
|
||
" # Метрики\n",
|
||
" mae = mean_absolute_error(y_test, y_pred)\n",
|
||
" rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
|
||
" r2 = r2_score(y_test, y_pred)\n",
|
||
" \n",
|
||
" # Сохранение результатов\n",
|
||
" results[name] = {\n",
|
||
" \"Best Params\": search.best_params_,\n",
|
||
" \"MAE\": mae,\n",
|
||
" \"RMSE\": rmse,\n",
|
||
" \"R2\": r2\n",
|
||
" }\n",
|
||
"\n",
|
||
"# Печать результатов\n",
|
||
"for name, metrics in results.items():\n",
|
||
" print(f\"\\nModel: {name}\")\n",
|
||
" for metric, value in metrics.items():\n",
|
||
" print(f\"{metric}: {value}\")\n",
|
||
"\n",
|
||
"# Формирование таблицы метрик из результатов регрессионных моделей\n",
|
||
"reg_metrics = pd.DataFrame.from_dict(results, orient=\"index\")[\n",
|
||
" [\"MAE\", \"RMSE\", \"R2\"]\n",
|
||
"]\n",
|
||
"\n",
|
||
"# Визуализация результатов с помощью стилизации\n",
|
||
"styled_metrics = (\n",
|
||
" reg_metrics.sort_values(by=\"RMSE\")\n",
|
||
" .style.background_gradient(cmap=\"viridis\", low=1, high=0.3, subset=[\"RMSE\", \"MAE\"])\n",
|
||
" .background_gradient(cmap=\"plasma\", low=0.3, high=1, subset=[\"R2\"])\n",
|
||
")\n",
|
||
"\n",
|
||
"# Отобразим таблицу\n",
|
||
"styled_metrics"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Немного переписал код: стало чуть лучше, но ненамного. Думаю, для моей первой работы подойдет."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"<h2>Приступим к задаче классификации</h2>"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 58,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training LogisticRegression...\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\salih\\OneDrive\\Рабочий стол\\3 курас\\МИИ\\laba1\\AIM-PIbd-31-Yaruskin-S-A\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 3 is smaller than n_iter=10. Running 3 iterations. For exhaustive searches, use GridSearchCV.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Training RandomForestClassifier...\n",
|
||
"Training KNN...\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\salih\\OneDrive\\Рабочий стол\\3 курас\\МИИ\\laba1\\AIM-PIbd-31-Yaruskin-S-A\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 8 is smaller than n_iter=10. Running 8 iterations. For exhaustive searches, use GridSearchCV.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"Model: LogisticRegression\n",
|
||
"Best Params: {'model__C': 0.1}\n",
|
||
"Accuracy: 1.0\n",
|
||
"F1 Score: 1.0\n",
|
||
"Confusion_matrix: [[ 50 0 0 0]\n",
|
||
" [ 0 129 0 0]\n",
|
||
" [ 0 0 316 0]\n",
|
||
" [ 0 0 0 13]]\n",
|
||
"\n",
|
||
"Model: RandomForestClassifier\n",
|
||
"Best Params: {'model__n_estimators': 300, 'model__max_features': None, 'model__max_depth': 15, 'model__criterion': 'entropy'}\n",
|
||
"Accuracy: 1.0\n",
|
||
"F1 Score: 1.0\n",
|
||
"Confusion_matrix: [[ 50 0 0 0]\n",
|
||
" [ 0 129 0 0]\n",
|
||
" [ 0 0 316 0]\n",
|
||
" [ 0 0 0 13]]\n",
|
||
"\n",
|
||
"Model: KNN\n",
|
||
"Best Params: {'model__weights': 'distance', 'model__n_neighbors': 9}\n",
|
||
"Accuracy: 0.9940944881889764\n",
|
||
"F1 Score: 0.9641274132899506\n",
|
||
"Confusion_matrix: [[ 50 0 0 0]\n",
|
||
" [ 0 129 0 0]\n",
|
||
" [ 0 0 316 0]\n",
|
||
" [ 1 0 2 10]]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||
"from sklearn.linear_model import LogisticRegression\n",
|
||
"from sklearn.neighbors import KNeighborsClassifier\n",
|
||
"from sklearn.metrics import accuracy_score, confusion_matrix, f1_score\n",
|
||
"\n",
|
||
"# Удалить строки с пропусками в experience_level\n",
|
||
"df = df.dropna(subset=['experience_level'])\n",
|
||
"\n",
|
||
"# Пересоздать y_class\n",
|
||
"y_class = df['experience_level'].map({'EN': 0, 'MI': 1, 'SE': 2, 'EX': 3})\n",
|
||
"\n",
|
||
"# Повторное разделение данных\n",
|
||
"X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X, y_class, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Определение моделей и их гиперпараметров\n",
|
||
"models_classification = {\n",
|
||
" \"LogisticRegression\": LogisticRegression(max_iter=1000),\n",
|
||
" \"RandomForestClassifier\": RandomForestClassifier(random_state=42),\n",
|
||
" \"KNN\": KNeighborsClassifier()\n",
|
||
"}\n",
|
||
"\n",
|
||
"param_grids_classification = {\n",
|
||
" \"LogisticRegression\": {\n",
|
||
" 'model__C': [0.1, 1, 10]\n",
|
||
" },\n",
|
||
" \"RandomForestClassifier\": {\n",
|
||
" \"model__n_estimators\": [100, 200, 300],\n",
|
||
" \"model__max_features\": [\"sqrt\", \"log2\", None],\n",
|
||
" \"model__max_depth\": [5, 10, 15, None],\n",
|
||
" \"model__criterion\": [\"gini\", \"entropy\"]\n",
|
||
" },\n",
|
||
" \"KNN\": {\n",
|
||
" 'model__n_neighbors': [3, 5, 7, 9],\n",
|
||
" 'model__weights': ['uniform', 'distance']\n",
|
||
" }\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Результаты\n",
|
||
"results_classification = {}\n",
|
||
"\n",
|
||
"# Перебор моделей\n",
|
||
"for name, model in models_classification.items():\n",
|
||
" print(f\"Training {name}...\")\n",
|
||
" pipeline = Pipeline(steps=[\n",
|
||
" ('preprocessor', preprocessor),\n",
|
||
" ('model', model)\n",
|
||
" ])\n",
|
||
" param_grid = param_grids_classification[name]\n",
|
||
" grid_search = RandomizedSearchCV(pipeline, param_distributions=param_grid, cv=5, scoring='f1_macro', n_jobs=-1, random_state=42)\n",
|
||
" grid_search.fit(X_train_clf, y_train_clf)\n",
|
||
"\n",
|
||
" # Лучшая модель\n",
|
||
" best_model = grid_search.best_estimator_\n",
|
||
" y_pred = best_model.predict(X_test_clf)\n",
|
||
"\n",
|
||
" # Метрики\n",
|
||
" acc = accuracy_score(y_test_clf, y_pred)\n",
|
||
" f1 = f1_score(y_test_clf, y_pred, average='macro')\n",
|
||
"\n",
|
||
" # Вычисление матрицы ошибок\n",
|
||
" c_matrix = confusion_matrix(y_test_clf, y_pred)\n",
|
||
"\n",
|
||
" # Сохранение результатов\n",
|
||
" results_classification[name] = {\n",
|
||
" \"Best Params\": grid_search.best_params_,\n",
|
||
" \"Accuracy\": acc,\n",
|
||
" \"F1 Score\": f1,\n",
|
||
" \"Confusion_matrix\": c_matrix\n",
|
||
" }\n",
|
||
"\n",
|
||
"# Печать результатов\n",
|
||
"for name, metrics in results_classification.items():\n",
|
||
" print(f\"\\nModel: {name}\")\n",
|
||
" for metric, value in metrics.items():\n",
|
||
" print(f\"{metric}: {value}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
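"Вывод: все три модели показывают очень высокую точность на тестовой выборке (LogisticRegression и RandomForestClassifier — 100 %, KNN — около 99,4 %). Такой результат подозрительно хорош и может указывать на утечку целевого признака: прежде чем делать вывод, что модели хорошо справляются с задачей классификации на данном наборе данных, стоит проверить, не входит ли experience_level в матрицу признаков X."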
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Нарисуем матрицу ошибок"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 59,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1200x1000 with 7 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn.metrics import ConfusionMatrixDisplay\n",
|
||
"\n",
|
||
"# Визуализация матриц ошибок\n",
|
||
"num_models = len(results_classification)\n",
|
||
"num_rows = (num_models // 2) + (num_models % 2) # Количество строк для подграфиков\n",
|
||
"fig, ax = plt.subplots(num_rows, 2, figsize=(12, 10), sharex=False, sharey=False)\n",
|
||
"\n",
|
||
"for index, (name, metrics) in enumerate(results_classification.items()):\n",
|
||
" c_matrix = metrics[\"Confusion_matrix\"]\n",
|
||
" disp = ConfusionMatrixDisplay(\n",
|
||
" confusion_matrix=c_matrix, display_labels=['EN', 'MI', 'SE', 'EX']\n",
|
||
" ).plot(ax=ax.flat[index])\n",
|
||
" disp.ax_.set_title(name)\n",
|
||
"\n",
|
||
"# Корректировка расположения графиков\n",
|
||
"plt.subplots_adjust(top=0.9, bottom=0.1, hspace=0.4, wspace=0.3)\n",
|
||
"plt.show()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Формируем таблицу метрик классификации"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 60,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<style type=\"text/css\">\n",
|
||
"#T_3cbcd_row0_col0, #T_3cbcd_row1_col0 {\n",
|
||
" background-color: #21918c;\n",
|
||
" color: #f1f1f1;\n",
|
||
" background-color: #da5a6a;\n",
|
||
" color: #f1f1f1;\n",
|
||
"}\n",
|
||
"#T_3cbcd_row0_col1, #T_3cbcd_row1_col1 {\n",
|
||
" background-color: #21918c;\n",
|
||
" color: #f1f1f1;\n",
|
||
"}\n",
|
||
"#T_3cbcd_row2_col0 {\n",
|
||
" background-color: #440154;\n",
|
||
" color: #f1f1f1;\n",
|
||
" background-color: #4e02a2;\n",
|
||
" color: #f1f1f1;\n",
|
||
"}\n",
|
||
"#T_3cbcd_row2_col1 {\n",
|
||
" background-color: #440154;\n",
|
||
" color: #f1f1f1;\n",
|
||
"}\n",
|
||
"</style>\n",
|
||
"<table id=\"T_3cbcd\">\n",
|
||
" <thead>\n",
|
||
" <tr>\n",
|
||
" <th class=\"blank level0\" > </th>\n",
|
||
" <th id=\"T_3cbcd_level0_col0\" class=\"col_heading level0 col0\" >Accuracy</th>\n",
|
||
" <th id=\"T_3cbcd_level0_col1\" class=\"col_heading level0 col1\" >F1 Score</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th id=\"T_3cbcd_level0_row0\" class=\"row_heading level0 row0\" >LogisticRegression</th>\n",
|
||
" <td id=\"T_3cbcd_row0_col0\" class=\"data row0 col0\" >1.000000</td>\n",
|
||
" <td id=\"T_3cbcd_row0_col1\" class=\"data row0 col1\" >1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th id=\"T_3cbcd_level0_row1\" class=\"row_heading level0 row1\" >RandomForestClassifier</th>\n",
|
||
" <td id=\"T_3cbcd_row1_col0\" class=\"data row1 col0\" >1.000000</td>\n",
|
||
" <td id=\"T_3cbcd_row1_col1\" class=\"data row1 col1\" >1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th id=\"T_3cbcd_level0_row2\" class=\"row_heading level0 row2\" >KNN</th>\n",
|
||
" <td id=\"T_3cbcd_row2_col0\" class=\"data row2 col0\" >0.994094</td>\n",
|
||
" <td id=\"T_3cbcd_row2_col1\" class=\"data row2 col1\" >0.964127</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n"
|
||
],
|
||
"text/plain": [
|
||
"<pandas.io.formats.style.Styler at 0x27c0bf32b40>"
|
||
]
|
||
},
|
||
"execution_count": 60,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Формируем таблицу метрик классификации\n",
|
||
"clf_metrics = pd.DataFrame.from_dict(results_classification, orient=\"index\")[[\"Accuracy\", \"F1 Score\"]]\n",
|
||
"\n",
|
||
"# Визуализация результатов с помощью стилизации\n",
|
||
"styled_metrics_clf = (\n",
|
||
" clf_metrics.sort_values(by=\"F1 Score\", ascending=False) # Сортировка по F1 Score\n",
|
||
" .style.background_gradient(cmap=\"viridis\", low=0, high=1, subset=[\"F1 Score\", \"Accuracy\"]) # Стилизация столбцов\n",
|
||
" .background_gradient(cmap=\"plasma\", low=0.3, high=1, subset=[\"Accuracy\"])\n",
|
||
")\n",
|
||
"\n",
|
||
"styled_metrics_clf"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"В итоге RandomForestClassifier и LogisticRegression показали точность 100 %. Это выглядит очень хорошо, но я не уверен, что такому результату можно доверять. KNN показал очень близкий результат."
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "aimenv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|