1159 lines
51 KiB
Plaintext
Raw Normal View History

2024-12-12 16:14:13 +04:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>work_year</th>\n",
" <th>experience_level</th>\n",
" <th>employment_type</th>\n",
" <th>job_title</th>\n",
" <th>salary</th>\n",
" <th>salary_currency</th>\n",
" <th>salary_in_usd</th>\n",
" <th>employee_residence</th>\n",
" <th>remote_ratio</th>\n",
" <th>company_location</th>\n",
" <th>company_size</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Principal Data Scientist</td>\n",
" <td>80000</td>\n",
" <td>EUR</td>\n",
" <td>85847</td>\n",
" <td>ES</td>\n",
" <td>100</td>\n",
" <td>ES</td>\n",
" <td>L</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2023</td>\n",
" <td>MI</td>\n",
" <td>CT</td>\n",
" <td>ML Engineer</td>\n",
" <td>30000</td>\n",
" <td>USD</td>\n",
" <td>30000</td>\n",
" <td>US</td>\n",
" <td>100</td>\n",
" <td>US</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2023</td>\n",
" <td>MI</td>\n",
" <td>CT</td>\n",
" <td>ML Engineer</td>\n",
" <td>25500</td>\n",
" <td>USD</td>\n",
" <td>25500</td>\n",
" <td>US</td>\n",
" <td>100</td>\n",
" <td>US</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Scientist</td>\n",
" <td>175000</td>\n",
" <td>USD</td>\n",
" <td>175000</td>\n",
" <td>CA</td>\n",
" <td>100</td>\n",
" <td>CA</td>\n",
" <td>M</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2023</td>\n",
" <td>SE</td>\n",
" <td>FT</td>\n",
" <td>Data Scientist</td>\n",
" <td>120000</td>\n",
" <td>USD</td>\n",
" <td>120000</td>\n",
" <td>CA</td>\n",
" <td>100</td>\n",
" <td>CA</td>\n",
" <td>M</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" work_year experience_level employment_type job_title \\\n",
"0 2023 SE FT Principal Data Scientist \n",
"1 2023 MI CT ML Engineer \n",
"2 2023 MI CT ML Engineer \n",
"3 2023 SE FT Data Scientist \n",
"4 2023 SE FT Data Scientist \n",
"\n",
" salary salary_currency salary_in_usd employee_residence remote_ratio \\\n",
"0 80000 EUR 85847 ES 100 \n",
"1 30000 USD 30000 US 100 \n",
"2 25500 USD 25500 US 100 \n",
"3 175000 USD 175000 CA 100 \n",
"4 120000 USD 120000 CA 100 \n",
"\n",
" company_location company_size \n",
"0 ES L \n",
"1 US S \n",
"2 US S \n",
"3 CA M \n",
"4 CA M "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
"\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2>Бизнес-цели</h2>\n",
"Задача регрессии: построить модель для прогноза зарплаты в USD (salary_in_usd), используя атрибуты.<br>\n",
"Задача классификации: определить уровень опыта сотрудника (experience_level) на основе других характеристик, таких как job_title, salary_in_usd и employment_type."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проведем предобработку данных и разделим их на обучающую и тестовую выборки"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Transformed feature shape (train): (2029, 151)\n",
"Transformed feature shape (test): (508, 151)\n"
]
}
],
"source": [
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"\n",
"# Удаление выбросов из столбца `salary_in_usd` с использованием IQR\n",
"Q1 = df['salary_in_usd'].quantile(0.25)\n",
"Q3 = df['salary_in_usd'].quantile(0.75)\n",
"IQR = Q3 - Q1\n",
"df = df[(df['salary_in_usd'] >= Q1 - 1.5 * IQR) & (df['salary_in_usd'] <= Q3 + 1.5 * IQR)]\n",
"\n",
"# Преобразование категориальных данных в числовые (если потребуется)\n",
"if 'remote_ratio' in df.columns:\n",
" df['remote_ratio'] = df['remote_ratio'].astype(int)\n",
"\n",
"# Удаление дубликатов\n",
"df.drop_duplicates(inplace=True)\n",
"\n",
"# Определение целевой переменной и признаков\n",
"X = df.drop(columns=['salary_in_usd', 'salary_currency', 'job_title']) # Признаки\n",
"y = df['salary_in_usd'] # Целевая переменная для регрессии\n",
"\n",
"# Определение числовых и категориальных признаков\n",
"numeric_features = ['work_year', 'remote_ratio']\n",
"categorical_features = ['experience_level', 'employment_type', \n",
" 'employee_residence', 'company_location', 'company_size']\n",
"\n",
"# Обработка числовых данных\n",
"numeric_transformer = Pipeline(steps=[\n",
" ('imputer', SimpleImputer(strategy='median')), # Заполнение пропусков медианой\n",
" ('scaler', StandardScaler()) # Нормализация данных\n",
"])\n",
"\n",
"# Обработка категориальных данных\n",
"categorical_transformer = Pipeline(steps=[\n",
" ('imputer', SimpleImputer(strategy='most_frequent')), # Заполнение пропусков модой\n",
" ('onehot', OneHotEncoder(handle_unknown='ignore')) # Преобразование в One-Hot Encoding\n",
"])\n",
"\n",
"# Комбинированный трансформер\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" ('num', numeric_transformer, numeric_features), # Применяем числовую обработку\n",
" ('cat', categorical_transformer, categorical_features) # Применяем категориальную обработку\n",
" ]\n",
")\n",
"\n",
"# Разделение данных на обучающую и тестовую выборки\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# Применение пайплайна\n",
"X_train_transformed = preprocessor.fit_transform(X_train)\n",
"X_test_transformed = preprocessor.transform(X_test)\n",
"\n",
"# Проверка результата трансформации\n",
"print(f\"Transformed feature shape (train): {X_train_transformed.shape}\")\n",
"print(f\"Transformed feature shape (test): {X_test_transformed.shape}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выведем результаты"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>work_year</th>\n",
" <th>remote_ratio</th>\n",
" <th>experience_level_EN</th>\n",
" <th>experience_level_EX</th>\n",
" <th>experience_level_MI</th>\n",
" <th>experience_level_SE</th>\n",
" <th>employment_type_CT</th>\n",
" <th>employment_type_FL</th>\n",
" <th>employment_type_FT</th>\n",
" <th>employment_type_PT</th>\n",
" <th>...</th>\n",
" <th>company_location_SI</th>\n",
" <th>company_location_SK</th>\n",
" <th>company_location_TH</th>\n",
" <th>company_location_TR</th>\n",
" <th>company_location_UA</th>\n",
" <th>company_location_US</th>\n",
" <th>company_location_VN</th>\n",
" <th>company_size_L</th>\n",
" <th>company_size_M</th>\n",
" <th>company_size_S</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-1.747172</td>\n",
" <td>1.016983</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-1.747172</td>\n",
" <td>1.016983</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.943539</td>\n",
" <td>-1.057887</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-0.401816</td>\n",
" <td>1.016983</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-0.401816</td>\n",
" <td>-0.020452</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 151 columns</p>\n",
"</div>"
],
"text/plain": [
" work_year remote_ratio experience_level_EN experience_level_EX \\\n",
"0 -1.747172 1.016983 0.0 0.0 \n",
"1 -1.747172 1.016983 0.0 0.0 \n",
"2 0.943539 -1.057887 0.0 0.0 \n",
"3 -0.401816 1.016983 0.0 0.0 \n",
"4 -0.401816 -0.020452 0.0 0.0 \n",
"\n",
" experience_level_MI experience_level_SE employment_type_CT \\\n",
"0 0.0 1.0 0.0 \n",
"1 0.0 1.0 0.0 \n",
"2 0.0 1.0 0.0 \n",
"3 1.0 0.0 0.0 \n",
"4 1.0 0.0 0.0 \n",
"\n",
" employment_type_FL employment_type_FT employment_type_PT ... \\\n",
"0 0.0 1.0 0.0 ... \n",
"1 0.0 1.0 0.0 ... \n",
"2 0.0 1.0 0.0 ... \n",
"3 0.0 1.0 0.0 ... \n",
"4 0.0 1.0 0.0 ... \n",
"\n",
" company_location_SI company_location_SK company_location_TH \\\n",
"0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 \n",
"\n",
" company_location_TR company_location_UA company_location_US \\\n",
"0 0.0 0.0 0.0 \n",
"1 0.0 0.0 1.0 \n",
"2 0.0 0.0 1.0 \n",
"3 0.0 0.0 1.0 \n",
"4 0.0 0.0 0.0 \n",
"\n",
" company_location_VN company_size_L company_size_M company_size_S \n",
"0 0.0 1.0 0.0 0.0 \n",
"1 0.0 1.0 0.0 0.0 \n",
"2 0.0 0.0 1.0 0.0 \n",
"3 0.0 0.0 1.0 0.0 \n",
"4 0.0 1.0 0.0 0.0 \n",
"\n",
"[5 rows x 151 columns]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Получим имена категориальных признаков после OneHotEncoder\n",
"categorical_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)\n",
"\n",
"# Объединим их с именами числовых признаков\n",
"feature_names = list(numeric_features) + list(categorical_feature_names)\n",
"\n",
"# Создадим DataFrame для преобразованных данных\n",
"X_train_transformed_df = pd.DataFrame(X_train_transformed.toarray() if hasattr(X_train_transformed, 'toarray') else X_train_transformed, columns=feature_names)\n",
"\n",
"# Выведем первые 5 строк обработанного набора данных\n",
"X_train_transformed_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Обучим три модели"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training LinearRegression...\n",
"Training RandomForestRegressor..."
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\salih\\OneDrive\\Рабочий стол\\3 курас\\МИИ\\laba1\\AIM-PIbd-31-Yaruskin-S-A\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 1 is smaller than n_iter=20. Running 1 iterations. For exhaustive searches, use GridSearchCV.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Training GradientBoostingRegressor...\n",
"\n",
"Model: LinearRegression\n",
"Best Params: {}\n",
"MAE: 35903.74761235383\n",
"RMSE: 45746.92374132039\n",
"R2: 0.41681042958060477\n",
"\n",
"Model: RandomForestRegressor\n",
"Best Params: {'model__n_estimators': 100, 'model__min_samples_split': 10, 'model__max_depth': 20}\n",
"MAE: 35382.49447920311\n",
"RMSE: 45711.49865435396\n",
"R2: 0.41771328994747514\n",
"\n",
"Model: GradientBoostingRegressor\n",
"Best Params: {'model__n_estimators': 50, 'model__max_depth': 5, 'model__learning_rate': 0.2}\n",
"MAE: 35404.55042553757\n",
"RMSE: 45669.354449671955\n",
"R2: 0.41878648590699374\n"
]
}
],
"source": [
"import numpy as np\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.ensemble import GradientBoostingRegressor\n",
"from sklearn.model_selection import RandomizedSearchCV\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"\n",
"# Задание случайного состояния\n",
"random_state = 42\n",
"\n",
"# Модели и параметры\n",
"models_regression = {\n",
" \"LinearRegression\": LinearRegression(),\n",
" \"RandomForestRegressor\": RandomForestRegressor(random_state=random_state),\n",
" \"GradientBoostingRegressor\": GradientBoostingRegressor(random_state=random_state)\n",
"}\n",
"\n",
"param_grids_regression = {\n",
" \"LinearRegression\": {},\n",
" \"RandomForestRegressor\": {\n",
" 'model__n_estimators': [50, 100, 200],\n",
" 'model__max_depth': [None, 10, 20],\n",
" 'model__min_samples_split': [2, 5, 10]\n",
" },\n",
" \"GradientBoostingRegressor\": {\n",
" 'model__n_estimators': [50, 100, 200],\n",
" 'model__learning_rate': [0.01, 0.1, 0.2],\n",
" 'model__max_depth': [3, 5, 10]\n",
" }\n",
"}\n",
"\n",
"# Результаты\n",
"results_regression = {}\n",
"\n",
"# Перебор моделей\n",
"for name, model in models_regression.items():\n",
" print(f\"Training {name}...\")\n",
" \n",
" # Создание пайплайна\n",
" pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor), # Используем уже созданный preprocessor\n",
" ('model', model)\n",
" ])\n",
" \n",
" # Определение параметров для RandomizedSearchCV\n",
" param_grid = param_grids_regression[name]\n",
" search = RandomizedSearchCV(pipeline, param_distributions=param_grid, \n",
" cv=5, scoring='neg_mean_absolute_error', \n",
" n_jobs=-1, random_state=random_state, n_iter=20)\n",
" search.fit(X_train, y_train)\n",
" \n",
" # Лучшая модель\n",
" best_model = search.best_estimator_\n",
" y_pred = best_model.predict(X_test)\n",
" \n",
" # Метрики\n",
" mae = mean_absolute_error(y_test, y_pred)\n",
" rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
" r2 = r2_score(y_test, y_pred)\n",
" \n",
" # Сохранение результатов\n",
" results_regression[name] = {\n",
" \"Best Params\": search.best_params_,\n",
" \"MAE\": mae,\n",
" \"RMSE\": rmse,\n",
" \"R2\": r2\n",
" }\n",
"\n",
"# Печать результатов\n",
"for name, metrics in results_regression.items():\n",
" print(f\"\\nModel: {name}\")\n",
" for metric, value in metrics.items():\n",
" print(f\"{metric}: {value}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_60a89_row0_col0 {\n",
" background-color: #25858e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_60a89_row0_col1, #T_60a89_row1_col0 {\n",
" background-color: #26818e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_60a89_row0_col2 {\n",
" background-color: #da5a6a;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_60a89_row1_col1 {\n",
" background-color: #37b878;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_60a89_row1_col2 {\n",
" background-color: #9a169f;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_60a89_row2_col0, #T_60a89_row2_col1 {\n",
" background-color: #a8db34;\n",
" color: #000000;\n",
"}\n",
"#T_60a89_row2_col2 {\n",
" background-color: #4e02a2;\n",
" color: #f1f1f1;\n",
"}\n",
"</style>\n",
"<table id=\"T_60a89\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" >&nbsp;</th>\n",
" <th id=\"T_60a89_level0_col0\" class=\"col_heading level0 col0\" >MAE</th>\n",
" <th id=\"T_60a89_level0_col1\" class=\"col_heading level0 col1\" >RMSE</th>\n",
" <th id=\"T_60a89_level0_col2\" class=\"col_heading level0 col2\" >R2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_60a89_level0_row0\" class=\"row_heading level0 row0\" >GradientBoostingRegressor</th>\n",
" <td id=\"T_60a89_row0_col0\" class=\"data row0 col0\" >35404.550426</td>\n",
" <td id=\"T_60a89_row0_col1\" class=\"data row0 col1\" >45669.354450</td>\n",
" <td id=\"T_60a89_row0_col2\" class=\"data row0 col2\" >0.418786</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_60a89_level0_row1\" class=\"row_heading level0 row1\" >RandomForestRegressor</th>\n",
" <td id=\"T_60a89_row1_col0\" class=\"data row1 col0\" >35382.494479</td>\n",
" <td id=\"T_60a89_row1_col1\" class=\"data row1 col1\" >45711.498654</td>\n",
" <td id=\"T_60a89_row1_col2\" class=\"data row1 col2\" >0.417713</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_60a89_level0_row2\" class=\"row_heading level0 row2\" >LinearRegression</th>\n",
" <td id=\"T_60a89_row2_col0\" class=\"data row2 col0\" >35903.747612</td>\n",
" <td id=\"T_60a89_row2_col1\" class=\"data row2 col1\" >45746.923741</td>\n",
" <td id=\"T_60a89_row2_col2\" class=\"data row2 col2\" >0.416810</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x27c059b8500>"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Формирование таблицы метрик из результатов регрессионных моделей\n",
"reg_metrics = pd.DataFrame.from_dict(results_regression, orient=\"index\")[\n",
" [\"MAE\", \"RMSE\", \"R2\"]\n",
"]\n",
"\n",
"# Визуализация результатов с помощью стилизации\n",
"styled_metrics = (\n",
" reg_metrics.sort_values(by=\"RMSE\")\n",
" .style.background_gradient(cmap=\"viridis\", low=1, high=0.3, subset=[\"RMSE\", \"MAE\"])\n",
" .background_gradient(cmap=\"plasma\", low=0.3, high=1, subset=[\"R2\"])\n",
")\n",
"\n",
"# Отобразим таблицу\n",
"styled_metrics"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Что-то слабоватые модели получились: R² не дотягивает даже до 0.5, нужно попробовать улучшить данные."
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Transformed feature shape (train): (1886, 41)\n",
"Transformed feature shape (test): (472, 41)\n",
"Training LinearRegression...\n",
"Training RandomForestRegressor...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\salih\\OneDrive\\Рабочий стол\\3 курас\\МИИ\\laba1\\AIM-PIbd-31-Yaruskin-S-A\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:320: UserWarning: The total space of parameters 1 is smaller than n_iter=20. Running 1 iterations. For exhaustive searches, use GridSearchCV.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training GradientBoostingRegressor...\n",
"\n",
"Model: LinearRegression\n",
"Best Params: {}\n",
"MAE: 37037.92339197355\n",
"RMSE: 45540.3787622507\n",
"R2: 0.4345993379829327\n",
"\n",
"Model: RandomForestRegressor\n",
"Best Params: {'model__n_estimators': 300, 'model__min_samples_split': 10, 'model__max_depth': 10}\n",
"MAE: 36990.476148447306\n",
"RMSE: 45909.387319133624\n",
"R2: 0.4253994599029146\n",
"\n",
"Model: GradientBoostingRegressor\n",
"Best Params: {'model__n_estimators': 100, 'model__max_depth': 3, 'model__learning_rate': 0.1}\n",
"MAE: 37119.53085174526\n",
"RMSE: 45945.52202948378\n",
"R2: 0.42449458199205714\n"
]
},
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_d0b41_row0_col0 {\n",
" background-color: #22a785;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_d0b41_row0_col1, #T_d0b41_row1_col0 {\n",
" background-color: #26818e;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_d0b41_row0_col2 {\n",
" background-color: #da5a6a;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_d0b41_row1_col1 {\n",
" background-color: #8ed645;\n",
" color: #000000;\n",
"}\n",
"#T_d0b41_row1_col2 {\n",
" background-color: #5e01a6;\n",
" color: #f1f1f1;\n",
"}\n",
"#T_d0b41_row2_col0, #T_d0b41_row2_col1 {\n",
" background-color: #a8db34;\n",
" color: #000000;\n",
"}\n",
"#T_d0b41_row2_col2 {\n",
" background-color: #4e02a2;\n",
" color: #f1f1f1;\n",
"}\n",
"</style>\n",
"<table id=\"T_d0b41\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" >&nbsp;</th>\n",
" <th id=\"T_d0b41_level0_col0\" class=\"col_heading level0 col0\" >MAE</th>\n",
" <th id=\"T_d0b41_level0_col1\" class=\"col_heading level0 col1\" >RMSE</th>\n",
" <th id=\"T_d0b41_level0_col2\" class=\"col_heading level0 col2\" >R2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_d0b41_level0_row0\" class=\"row_heading level0 row0\" >LinearRegression</th>\n",
" <td id=\"T_d0b41_row0_col0\" class=\"data row0 col0\" >37037.923392</td>\n",
" <td id=\"T_d0b41_row0_col1\" class=\"data row0 col1\" >45540.378762</td>\n",
" <td id=\"T_d0b41_row0_col2\" class=\"data row0 col2\" >0.434599</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_d0b41_level0_row1\" class=\"row_heading level0 row1\" >RandomForestRegressor</th>\n",
" <td id=\"T_d0b41_row1_col0\" class=\"data row1 col0\" >36990.476148</td>\n",
" <td id=\"T_d0b41_row1_col1\" class=\"data row1 col1\" >45909.387319</td>\n",
" <td id=\"T_d0b41_row1_col2\" class=\"data row1 col2\" >0.425399</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_d0b41_level0_row2\" class=\"row_heading level0 row2\" >GradientBoostingRegressor</th>\n",
" <td id=\"T_d0b41_row2_col0\" class=\"data row2 col0\" >37119.530852</td>\n",
" <td id=\"T_d0b41_row2_col1\" class=\"data row2 col1\" >45945.522029</td>\n",
" <td id=\"T_d0b41_row2_col2\" class=\"data row2 col2\" >0.424495</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x27c00e56ae0>"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Функция для приведения выбросов к среднему значению\n",
"def handle_outliers(df, column):\n",
" Q1 = df[column].quantile(0.25)\n",
" Q3 = df[column].quantile(0.75)\n",
" IQR = Q3 - Q1\n",
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
" \n",
" mean_value = df[column].mean()\n",
" df[column] = np.where((df[column] < lower_bound) | (df[column] > upper_bound), mean_value, df[column])\n",
" return df\n",
"\n",
"# Приведение выбросов в столбце `salary_in_usd` к среднему значению\n",
"df = handle_outliers(df, 'salary_in_usd')\n",
"\n",
"# Преобразование категориальных данных в строковые для корректной обработки\n",
"if 'remote_ratio' in df.columns:\n",
" df['remote_ratio'] = df['remote_ratio'].astype(str)\n",
"\n",
"# Удаление дубликатов\n",
"df.drop_duplicates(inplace=True)\n",
"\n",
"# Определение целевой переменной и признаков\n",
"X = df.drop(columns=['salary_in_usd', 'salary_currency', 'job_title']) # Признаки\n",
"y = df['salary_in_usd'] # Целевая переменная для регрессии\n",
"\n",
"# Определение числовых и категориальных признаков\n",
"numeric_features = ['work_year'] # Убрали 'remote_ratio', так как это категориальный признак\n",
"categorical_features = ['experience_level', 'employment_type', \n",
" 'employee_residence', 'company_location', 'company_size', 'remote_ratio']\n",
"\n",
"# Обработка числовых данных\n",
"numeric_transformer = Pipeline(steps=[\n",
" ('imputer', SimpleImputer(strategy='median')), # Заполнение пропусков медианой\n",
" ('scaler', StandardScaler()) # Нормализация данных\n",
"])\n",
"\n",
"# Обработка категориальных данных\n",
"categorical_transformer = Pipeline(steps=[\n",
" ('imputer', SimpleImputer(strategy='most_frequent')), # Заполнение пропусков модой\n",
" ('onehot', OneHotEncoder(handle_unknown='ignore')) # Преобразование в One-Hot Encoding\n",
"])\n",
"\n",
"# Комбинированный трансформер\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" ('num', numeric_transformer, numeric_features), # Применяем числовую обработку\n",
" ('cat', categorical_transformer, categorical_features) # Применяем категориальную обработку\n",
" ]\n",
")\n",
"\n",
"# Разделение данных на обучающую и тестовую выборки\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# Применение пайплайна\n",
"X_train_transformed = preprocessor.fit_transform(X_train)\n",
"X_test_transformed = preprocessor.transform(X_test)\n",
"\n",
"# Проверка результата трансформации\n",
"print(f\"Transformed feature shape (train): {X_train_transformed.shape}\")\n",
"print(f\"Transformed feature shape (test): {X_test_transformed.shape}\")\n",
"\n",
"# Определение моделей и их параметров\n",
"models = {\n",
" \"LinearRegression\": LinearRegression(),\n",
" \"RandomForestRegressor\": RandomForestRegressor(random_state=42),\n",
" \"GradientBoostingRegressor\": GradientBoostingRegressor(random_state=42)\n",
"}\n",
"\n",
"param_grids = {\n",
" \"LinearRegression\": {},\n",
" \"RandomForestRegressor\": {\n",
" 'model__n_estimators': [100, 200, 300],\n",
" 'model__max_depth': [10, 20, None],\n",
" 'model__min_samples_split': [2, 5, 10]\n",
" },\n",
" \"GradientBoostingRegressor\": {\n",
" 'model__n_estimators': [100, 200, 300],\n",
" 'model__learning_rate': [0.01, 0.1, 0.2],\n",
" 'model__max_depth': [3, 5, 7]\n",
" }\n",
"}\n",
"\n",
"# Результаты\n",
"results = {}\n",
"\n",
"# Обучение моделей с подбором гиперпараметров\n",
"for name, model in models.items():\n",
" print(f\"Training {name}...\")\n",
" pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('model', model)\n",
" ])\n",
" param_grid = param_grids[name]\n",
" search = RandomizedSearchCV(pipeline, param_distributions=param_grid, cv=5, \n",
" scoring='neg_mean_absolute_error', n_jobs=-1, \n",
" random_state=42, n_iter=20)\n",
" search.fit(X_train, y_train)\n",
" \n",
" # Лучшая модель\n",
" best_model = search.best_estimator_\n",
" y_pred = best_model.predict(X_test)\n",
" \n",
" # Метрики\n",
" mae = mean_absolute_error(y_test, y_pred)\n",
" rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
" r2 = r2_score(y_test, y_pred)\n",
" \n",
" # Сохранение результатов\n",
" results[name] = {\n",
" \"Best Params\": search.best_params_,\n",
" \"MAE\": mae,\n",
" \"RMSE\": rmse,\n",
" \"R2\": r2\n",
" }\n",
"\n",
"# Печать результатов\n",
"for name, metrics in results.items():\n",
" print(f\"\\nModel: {name}\")\n",
" for metric, value in metrics.items():\n",
" print(f\"{metric}: {value}\")\n",
"\n",
"# Формирование таблицы метрик из результатов регрессионных моделей\n",
"reg_metrics = pd.DataFrame.from_dict(results, orient=\"index\")[\n",
" [\"MAE\", \"RMSE\", \"R2\"]\n",
"]\n",
"\n",
"# Визуализация результатов с помощью стилизации\n",
"styled_metrics = (\n",
" reg_metrics.sort_values(by=\"RMSE\")\n",
" .style.background_gradient(cmap=\"viridis\", low=1, high=0.3, subset=[\"RMSE\", \"MAE\"])\n",
" .background_gradient(cmap=\"plasma\", low=0.3, high=1, subset=[\"R2\"])\n",
")\n",
"\n",
"# Отобразим таблицу\n",
"styled_metrics"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Немного переписал код: стало чуть лучше (по R²), но ненамного. Думаю, для моей первой работы подойдет."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2>Приступим к задаче классификации</h2>"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "'Leather interior'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
"File \u001b[1;32mc:\\Users\\salih\\OneDrive\\Рабочий стол\\3 курас\\МИИ\\laba1\\AIM-PIbd-31-Yaruskin-S-A\\aimenv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
"File \u001b[1;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mpandas\\\\_libs\\\\hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mpandas\\\\_libs\\\\hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n",
"\u001b[1;31mKeyError\u001b[0m: 'Leather interior'",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[39], line 14\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m accuracy_score, confusion_matrix, f1_score\n\u001b[0;32m 13\u001b[0m \u001b[38;5;66;03m# Преобразование целевой переменной для классификации\u001b[39;00m\n\u001b[1;32m---> 14\u001b[0m y_class \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mLeather interior\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mmap({\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mYes\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m1\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNo\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m0\u001b[39m}) \u001b[38;5;66;03m# Преобразуем в 0/1\u001b[39;00m\n\u001b[0;32m 15\u001b[0m X \u001b[38;5;241m=\u001b[39m df\u001b[38;5;241m.\u001b[39mdrop(columns\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mLeather interior\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mID\u001b[39m\u001b[38;5;124m'\u001b[39m]) \u001b[38;5;66;03m# Признаки\u001b[39;00m\n\u001b[0;32m 17\u001b[0m \u001b[38;5;66;03m# Определение числовых и категориальных признаков\u001b[39;00m\n",
"File \u001b[1;32mc:\\Users\\salih\\OneDrive\\Рабочий стол\\3 курас\\МИИ\\laba1\\AIM-PIbd-31-Yaruskin-S-A\\aimenv\\Lib\\site-packages\\pandas\\core\\frame.py:4102\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 4100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m 4101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[1;32m-> 4102\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[0;32m 4104\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n",
"File \u001b[1;32mc:\\Users\\salih\\OneDrive\\Рабочий стол\\3 курас\\МИИ\\laba1\\AIM-PIbd-31-Yaruskin-S-A\\aimenv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[0;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[0;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[0;32m 3810\u001b[0m ):\n\u001b[0;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[1;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[0;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[0;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[0;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[0;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[0;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n",
"\u001b[1;31mKeyError\u001b[0m: 'Leather interior'"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.model_selection import train_test_split, RandomizedSearchCV\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.metrics import accuracy_score, confusion_matrix, f1_score\n",
"\n",
"# Преобразование целевой переменной для классификации\n",
"y_class = df['Leather interior'].map({'Yes': 1, 'No': 0}) # Преобразуем в 0/1\n",
"X = df.drop(columns=['Leather interior', 'ID']) # Признаки\n",
"\n",
"# Определение числовых и категориальных признаков\n",
"numeric_features = ['work_year'] # Пример числового признака\n",
"categorical_features = ['experience_level', 'employment_type', 'employee_residence', \n",
" 'company_location', 'company_size', 'remote_ratio']\n",
"\n",
"# Обработка числовых данных\n",
"numeric_transformer = Pipeline(steps=[\n",
" ('imputer', SimpleImputer(strategy='median')),\n",
" ('scaler', StandardScaler())\n",
"])\n",
"\n",
"# Обработка категориальных данных\n",
"categorical_transformer = Pipeline(steps=[\n",
" ('imputer', SimpleImputer(strategy='most_frequent')),\n",
" ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
"])\n",
"\n",
"# Комбинированный трансформер\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" ('num', numeric_transformer, numeric_features),\n",
" ('cat', categorical_transformer, categorical_features)\n",
" ]\n",
")\n",
"\n",
"# Разделение данных на обучающую и тестовую выборки\n",
"X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X, y_class, test_size=0.2, random_state=42)\n",
"\n",
"# Определение моделей и их гиперпараметров\n",
"models_classification = {\n",
" \"LogisticRegression\": LogisticRegression(max_iter=1000),\n",
" \"RandomForestClassifier\": RandomForestClassifier(random_state=42),\n",
" \"KNN\": KNeighborsClassifier()\n",
"}\n",
"\n",
"param_grids_classification = {\n",
" \"LogisticRegression\": {\n",
" 'model__C': [0.1, 1, 10]\n",
" },\n",
" \"RandomForestClassifier\": {\n",
" \"model__n_estimators\": [100, 200, 300],\n",
" \"model__max_features\": [\"sqrt\", \"log2\", None],\n",
" \"model__max_depth\": [5, 10, 15, None],\n",
" \"model__criterion\": [\"gini\", \"entropy\"]\n",
" },\n",
" \"KNN\": {\n",
" 'model__n_neighbors': [3, 5, 7, 9],\n",
" 'model__weights': ['uniform', 'distance']\n",
" }\n",
"}\n",
"\n",
"# Результаты\n",
"results_classification = {}\n",
"\n",
"# Перебор моделей\n",
"for name, model in models_classification.items():\n",
" print(f\"Training {name}...\")\n",
" pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('model', model)\n",
" ])\n",
" param_grid = param_grids_classification[name]\n",
" grid_search = RandomizedSearchCV(pipeline, param_distributions=param_grid, cv=5, scoring='f1', n_jobs=-1, random_state=42)\n",
" grid_search.fit(X_train_clf, y_train_clf)\n",
"\n",
" # Лучшая модель\n",
" best_model = grid_search.best_estimator_\n",
" y_pred = best_model.predict(X_test_clf)\n",
"\n",
" # Метрики\n",
" acc = accuracy_score(y_test_clf, y_pred)\n",
" f1 = f1_score(y_test_clf, y_pred)\n",
"\n",
" # Вычисление матрицы ошибок\n",
" c_matrix = confusion_matrix(y_test_clf, y_pred)\n",
"\n",
" # Сохранение результатов\n",
" results_classification[name] = {\n",
" \"Best Params\": grid_search.best_params_,\n",
" \"Accuracy\": acc,\n",
" \"F1 Score\": f1,\n",
" \"Confusion_matrix\": c_matrix\n",
" }\n",
"\n",
"# Печать результатов\n",
"for name, metrics in results_classification.items():\n",
" print(f\"\\nModel: {name}\")\n",
" for metric, value in metrics.items():\n",
" print(f\"{metric}: {value}\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aimenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}