Files
Владимир Данилов 8ab2d8476d Lab 4
2025-03-14 13:44:13 +04:00

635 KiB
Raw Permalink Blame History

Начало лабораторной

Выгрузка данных из csv файла в датафрейм

In [263]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay
In [264]:
df = pd.read_csv(".//static//csv//ds_salaries.csv")

print("Первые 5 строк датасета:")
print(df.head())
print("\nИнформация о датасете:")
print(df.info())
Первые 5 строк датасета:
   work_year experience_level employment_type                 job_title  \
0       2023               SE              FT  Principal Data Scientist   
1       2023               MI              CT               ML Engineer   
2       2023               MI              CT               ML Engineer   
3       2023               SE              FT            Data Scientist   
4       2023               SE              FT            Data Scientist   

   salary salary_currency  salary_in_usd employee_residence  remote_ratio  \
0   80000             EUR          85847                 ES           100   
1   30000             USD          30000                 US           100   
2   25500             USD          25500                 US           100   
3  175000             USD         175000                 CA           100   
4  120000             USD         120000                 CA           100   

  company_location company_size  
0               ES            L  
1               US            S  
2               US            S  
3               CA            M  
4               CA            M  

Информация о датасете:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB
None

Бизнес цели:

Предсказание зарплаты в долларах (salary_in_usd) на основе характеристик сотрудника и компании.

Предсказание уровня опыта (experience_level) сотрудника.

Разделение данных на обучающую и тестовую выборки

In [265]:
X = df.drop(columns=["salary_in_usd"])
y = df["salary_in_usd"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Для классификации: предсказание experience_level
y_clf = df["experience_level"]
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X, y_clf, test_size=0.2, random_state=42)

Предбоработка данных

In [266]:
numeric_features = ["work_year", "salary", "remote_ratio"]
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = ["experience_level", "employment_type", "job_title", "company_size"]
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

Модели для регрессии

In [267]:
models_reg = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

# Конвейер для регрессии
pipelines_reg = {}
for name, model in models_reg.items():
    pipelines_reg[name] = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

Демонстрация работы конвейера

In [268]:
preprocessing_result = pipelines_reg["Random Forest"].named_steps['preprocessor'].fit_transform(X_train)

print(preprocessing_result.shape)
print(preprocessing_result[:5])
(3004, 99)
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 35 stored elements and shape (5, 99)>
  Coords	Values
  (0, 0)	-0.5306956799070746
  (0, 1)	-0.2181480307983341
  (0, 2)	-0.9659918535333452
  (0, 6)	1.0
  (0, 9)	1.0
  (0, 42)	1.0
  (0, 97)	1.0
  (1, 0)	0.9146441409823196
  (1, 1)	-0.05663343436009802
  (1, 2)	1.0905917949529544
  (1, 5)	1.0
  (1, 9)	1.0
  (1, 93)	1.0
  (1, 97)	1.0
  (2, 0)	-0.5306956799070746
  (2, 1)	-0.12826405161609655
  (2, 2)	-0.9659918535333452
  (2, 6)	1.0
  (2, 9)	1.0
  (2, 34)	1.0
  (2, 97)	1.0
  (3, 0)	-0.5306956799070746
  (3, 1)	-0.049995848205102014
  (3, 2)	1.0905917949529544
  (3, 6)	1.0
  (3, 9)	1.0
  (3, 42)	1.0
  (3, 97)	1.0
  (4, 0)	0.9146441409823196
  (4, 1)	-0.024551767944283997
  (4, 2)	1.0905917949529544
  (4, 6)	1.0
  (4, 9)	1.0
  (4, 42)	1.0
  (4, 97)	1.0
In [269]:
# Преобразуем разреженную матрицу в плотную форму
preprocessing_result_dense = preprocessing_result.toarray()

# Получаем имена признаков после предобработки
columns_reg = pipelines_reg["Random Forest"].named_steps['preprocessor'].get_feature_names_out()

# Если количество столбцов в данных совпадает с количеством в названиях
if preprocessing_result_dense.shape[1] == len(columns_reg):
    preprocessed_df = pd.DataFrame(
        preprocessing_result_dense,
        columns=columns_reg
    )
else:
    # В случае, если количество столбцов не совпадает, выводим предупреждение
    print(f"Количество столбцов в данных ({preprocessing_result_dense.shape[1]}) не совпадает с количеством в названиях ({len(columns_reg)})")
    # Можно применить нужное количество столбцов из названий
    preprocessed_df = pd.DataFrame(
        preprocessing_result_dense,
        columns=columns_reg[:preprocessing_result_dense.shape[1]]
    )

# Выводим DataFrame с предобработанными данными
preprocessed_df.head()
Out[269]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
num__work_year num__salary num__remote_ratio cat__experience_level_EN cat__experience_level_EX cat__experience_level_MI cat__experience_level_SE cat__employment_type_CT cat__employment_type_FL cat__employment_type_FT ... cat__job_title_Principal Data Scientist cat__job_title_Principal Machine Learning Engineer cat__job_title_Product Data Analyst cat__job_title_Research Engineer cat__job_title_Research Scientist cat__job_title_Software Data Engineer cat__job_title_Staff Data Scientist cat__company_size_L cat__company_size_M cat__company_size_S
0 -0.530696 -0.218148 -0.965992 0.0 0.0 0.0 1.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
1 0.914644 -0.056633 1.090592 0.0 0.0 1.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0
2 -0.530696 -0.128264 -0.965992 0.0 0.0 0.0 1.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
3 -0.530696 -0.049996 1.090592 0.0 0.0 0.0 1.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
4 0.914644 -0.024552 1.090592 0.0 0.0 0.0 1.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0

5 rows × 99 columns

Модели для классификации

In [270]:
models_clf = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier()
}

# Конвейер для классификации
pipelines_clf = {}
for name, model in models_clf.items():
    pipelines_clf[name] = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

Демонстрация работы конвейера

In [271]:
preprocessing_result = pipelines_clf["Random Forest"].named_steps['preprocessor'].fit_transform(X_train)

print("Shape of preprocessed data:", preprocessing_result.shape)
print("First 5 rows of preprocessed data:", preprocessing_result[:5])
Shape of preprocessed data: (3004, 99)
First 5 rows of preprocessed data: <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 35 stored elements and shape (5, 99)>
  Coords	Values
  (0, 0)	-0.5306956799070746
  (0, 1)	-0.2181480307983341
  (0, 2)	-0.9659918535333452
  (0, 6)	1.0
  (0, 9)	1.0
  (0, 42)	1.0
  (0, 97)	1.0
  (1, 0)	0.9146441409823196
  (1, 1)	-0.05663343436009802
  (1, 2)	1.0905917949529544
  (1, 5)	1.0
  (1, 9)	1.0
  (1, 93)	1.0
  (1, 97)	1.0
  (2, 0)	-0.5306956799070746
  (2, 1)	-0.12826405161609655
  (2, 2)	-0.9659918535333452
  (2, 6)	1.0
  (2, 9)	1.0
  (2, 34)	1.0
  (2, 97)	1.0
  (3, 0)	-0.5306956799070746
  (3, 1)	-0.049995848205102014
  (3, 2)	1.0905917949529544
  (3, 6)	1.0
  (3, 9)	1.0
  (3, 42)	1.0
  (3, 97)	1.0
  (4, 0)	0.9146441409823196
  (4, 1)	-0.024551767944283997
  (4, 2)	1.0905917949529544
  (4, 6)	1.0
  (4, 9)	1.0
  (4, 42)	1.0
  (4, 97)	1.0
In [272]:
# Преобразуем разреженную матрицу в плотную
preprocessing_result_dense = preprocessing_result.toarray()

# Получаем имена признаков после предобработки
columns_clf = pipelines_clf["Random Forest"].named_steps['preprocessor'].get_feature_names_out()

# Если количество столбцов в данных совпадает с количеством в названиях
if preprocessing_result_dense.shape[1] == len(columns_clf):
    preprocessed_df = pd.DataFrame(
        preprocessing_result_dense,
        columns=columns_clf
    )
else:
    # В случае, если количество столбцов не совпадает, выводим предупреждение
    print(f"Количество столбцов в данных ({preprocessing_result_dense.shape[1]}) не совпадает с количеством в названиях ({len(columns_clf)})")
    # Можно применить нужное количество столбцов из названий
    preprocessed_df = pd.DataFrame(
        preprocessing_result_dense,
        columns=columns_clf[:preprocessing_result_dense.shape[1]]
    )

# Выводим DataFrame с предобработанными данными
preprocessed_df.head()
Out[272]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
num__work_year num__salary num__remote_ratio cat__experience_level_EN cat__experience_level_EX cat__experience_level_MI cat__experience_level_SE cat__employment_type_CT cat__employment_type_FL cat__employment_type_FT ... cat__job_title_Principal Data Scientist cat__job_title_Principal Machine Learning Engineer cat__job_title_Product Data Analyst cat__job_title_Research Engineer cat__job_title_Research Scientist cat__job_title_Software Data Engineer cat__job_title_Staff Data Scientist cat__company_size_L cat__company_size_M cat__company_size_S
0 -0.530696 -0.218148 -0.965992 0.0 0.0 0.0 1.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
1 0.914644 -0.056633 1.090592 0.0 0.0 1.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0
2 -0.530696 -0.128264 -0.965992 0.0 0.0 0.0 1.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
3 -0.530696 -0.049996 1.090592 0.0 0.0 0.0 1.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
4 0.914644 -0.024552 1.090592 0.0 0.0 0.0 1.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0

5 rows × 99 columns

Сводная таблица оценок качества для использованных моделей классификации

In [273]:
# Подсчитываем количество строк и столбцов для подграфиков
n_models = len(pipelines_clf)
n_cols = 2
n_rows = (n_models // n_cols) + (n_models % n_cols > 0)  # Подсчитываем количество строк для подграфиков

fig, ax = plt.subplots(n_rows, n_cols, figsize=(12, 5 * n_rows), sharex=False, sharey=False)

for index, (name, pipeline) in enumerate(pipelines_clf.items()):
    # Обучаем модель
    pipeline.fit(X_train_clf, y_train_clf)
    
    # Получаем предсказания
    y_pred_clf = pipeline.predict(X_test_clf)
    
    # Строим матрицу ошибок
    c_matrix = confusion_matrix(y_test_clf, y_pred_clf)
    
    # Отображаем матрицу ошибок
    row = index // n_cols  # Строка для текущего подграфика
    col = index % n_cols   # Столбец для текущего подграфика
    disp = ConfusionMatrixDisplay(confusion_matrix=c_matrix, display_labels=np.unique(y_clf)).plot(ax=ax[row, col])
    disp.ax_.set_title(name)

# Отключаем пустые подграфики, если они есть
if n_models % n_cols != 0:
    for i in range(n_models, n_rows * n_cols):
        fig.delaxes(ax.flatten()[i])

plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)
plt.show()
No description has been provided for this image

В ходе анализа работы моделей машинного обучения на задаче классификации с использованием матриц ошибок, можно отметить следующие ключевые моменты:

Logistic Regression и Random Forest показали хорошие результаты в классификации.

K-Nearest Neighbors (KNN) также демонстрирует удовлетворительные результаты. Количество правильных классификаций значительно превышает ошибки. Однако модель ошибается в классификации между классами "EN", "EX", "MI" и "SE", что проявляется в нескольких ложных отрицательных и ложных положительных результатах. Эти ошибки малозначительны.

Пример использования обученной модели для предсказания

Нстройка гиперпараметров для регрессии

In [274]:
param_grid_rf = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [10, 20]
}

param_grid_gb = {
    'regressor__n_estimators': [100, 200],
    'regressor__learning_rate': [0.01, 0.1]
}

# Настройка для Random Forest
grid_search_rf = GridSearchCV(pipelines_reg["Random Forest"], param_grid_rf, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

# Настройка для Gradient Boosting
grid_search_gb = GridSearchCV(pipelines_reg["Gradient Boosting"], param_grid_gb, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_gb.fit(X_train, y_train)
Out[274]:
<style>#sk-container-id-11 { /* Definition of color scheme common for light and dark mode */ --sklearn-color-text: black; --sklearn-color-line: gray; /* Definition of color scheme for unfitted estimators */ --sklearn-color-unfitted-level-0: #fff5e6; --sklearn-color-unfitted-level-1: #f6e4d2; --sklearn-color-unfitted-level-2: #ffe0b3; --sklearn-color-unfitted-level-3: chocolate; /* Definition of color scheme for fitted estimators */ --sklearn-color-fitted-level-0: #f0f8ff; --sklearn-color-fitted-level-1: #d4ebff; --sklearn-color-fitted-level-2: #b3dbfd; --sklearn-color-fitted-level-3: cornflowerblue; /* Specific color for light theme */ --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black))); --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white))); --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black))); --sklearn-color-icon: #696969; @media (prefers-color-scheme: dark) { /* Redefinition of color scheme for dark theme */ --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white))); --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111))); --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white))); --sklearn-color-icon: #878787; } } #sk-container-id-11 { color: var(--sklearn-color-text); } #sk-container-id-11 pre { padding: 0; } #sk-container-id-11 input.sk-hidden--visually { border: 0; clip: rect(1px 1px 1px 1px); clip: rect(1px, 1px, 1px, 1px); height: 1px; margin: -1px; overflow: hidden; padding: 0; position: absolute; width: 1px; } #sk-container-id-11 div.sk-dashed-wrapped { border: 1px dashed var(--sklearn-color-line); margin: 0 0.4em 0.5em 0.4em; box-sizing: border-box; padding-bottom: 0.4em; background-color: var(--sklearn-color-background); } #sk-container-id-11 div.sk-container { /* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */ display: inline-block !important; position: relative; } #sk-container-id-11 div.sk-text-repr-fallback { display: none; } div.sk-parallel-item, div.sk-serial, div.sk-item { /* draw centered vertical line to link estimators */ background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background)); background-size: 2px 100%; background-repeat: no-repeat; background-position: center center; } /* Parallel-specific style estimator block */ #sk-container-id-11 div.sk-parallel-item::after { content: ""; width: 100%; border-bottom: 2px solid var(--sklearn-color-text-on-default-background); flex-grow: 1; } #sk-container-id-11 div.sk-parallel { display: flex; align-items: stretch; justify-content: center; background-color: var(--sklearn-color-background); position: relative; } #sk-container-id-11 div.sk-parallel-item { display: flex; flex-direction: column; } #sk-container-id-11 div.sk-parallel-item:first-child::after { align-self: flex-end; width: 50%; } #sk-container-id-11 div.sk-parallel-item:last-child::after { align-self: flex-start; width: 50%; } #sk-container-id-11 div.sk-parallel-item:only-child::after { width: 0; } /* Serial-specific style estimator block */ #sk-container-id-11 div.sk-serial { display: flex; flex-direction: column; align-items: center; background-color: var(--sklearn-color-background); padding-right: 1em; padding-left: 1em; } /* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is clickable and can be expanded/collapsed. - Pipeline and ColumnTransformer use this feature and define the default style - Estimators will overwrite some part of the style using the `sk-estimator` class */ /* Pipeline and ColumnTransformer style (default) */ #sk-container-id-11 div.sk-toggleable { /* Default theme specific background. It is overwritten whether we have a specific estimator or a Pipeline/ColumnTransformer */ background-color: var(--sklearn-color-background); } /* Toggleable label */ #sk-container-id-11 label.sk-toggleable__label { cursor: pointer; display: block; width: 100%; margin-bottom: 0; padding: 0.5em; box-sizing: border-box; text-align: center; } #sk-container-id-11 label.sk-toggleable__label-arrow:before { /* Arrow on the left of the label */ content: "▸"; float: left; margin-right: 0.25em; color: var(--sklearn-color-icon); } #sk-container-id-11 label.sk-toggleable__label-arrow:hover:before { color: var(--sklearn-color-text); } /* Toggleable content - dropdown */ #sk-container-id-11 div.sk-toggleable__content { max-height: 0; max-width: 0; overflow: hidden; text-align: left; /* unfitted */ background-color: var(--sklearn-color-unfitted-level-0); } #sk-container-id-11 div.sk-toggleable__content.fitted { /* fitted */ background-color: var(--sklearn-color-fitted-level-0); } #sk-container-id-11 div.sk-toggleable__content pre { margin: 0.2em; border-radius: 0.25em; color: var(--sklearn-color-text); /* unfitted */ background-color: var(--sklearn-color-unfitted-level-0); } #sk-container-id-11 div.sk-toggleable__content.fitted pre { /* unfitted */ background-color: var(--sklearn-color-fitted-level-0); } #sk-container-id-11 input.sk-toggleable__control:checked~div.sk-toggleable__content { /* Expand drop-down */ max-height: 200px; max-width: 100%; overflow: auto; } #sk-container-id-11 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before { content: "▾"; } /* Pipeline/ColumnTransformer-specific style */ #sk-container-id-11 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label { color: var(--sklearn-color-text); background-color: var(--sklearn-color-unfitted-level-2); } #sk-container-id-11 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label { background-color: var(--sklearn-color-fitted-level-2); } /* Estimator-specific style */ /* Colorize estimator box */ #sk-container-id-11 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label { /* unfitted */ background-color: var(--sklearn-color-unfitted-level-2); } #sk-container-id-11 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label { /* fitted */ background-color: var(--sklearn-color-fitted-level-2); } #sk-container-id-11 div.sk-label label.sk-toggleable__label, #sk-container-id-11 div.sk-label label { /* The background is the default theme color */ color: var(--sklearn-color-text-on-default-background); } /* On hover, darken the color of the background */ #sk-container-id-11 div.sk-label:hover label.sk-toggleable__label { color: var(--sklearn-color-text); background-color: var(--sklearn-color-unfitted-level-2); } /* Label box, darken color on hover, fitted */ #sk-container-id-11 div.sk-label.fitted:hover label.sk-toggleable__label.fitted { color: var(--sklearn-color-text); background-color: var(--sklearn-color-fitted-level-2); } /* Estimator label */ #sk-container-id-11 div.sk-label label { font-family: monospace; font-weight: bold; display: inline-block; line-height: 1.2em; } #sk-container-id-11 div.sk-label-container { text-align: center; } /* Estimator-specific */ #sk-container-id-11 div.sk-estimator { font-family: monospace; border: 1px dotted var(--sklearn-color-border-box); border-radius: 0.25em; box-sizing: border-box; margin-bottom: 0.5em; /* unfitted */ background-color: var(--sklearn-color-unfitted-level-0); } #sk-container-id-11 div.sk-estimator.fitted { /* fitted */ background-color: var(--sklearn-color-fitted-level-0); } /* on hover */ #sk-container-id-11 div.sk-estimator:hover { /* unfitted */ background-color: var(--sklearn-color-unfitted-level-2); } #sk-container-id-11 div.sk-estimator.fitted:hover { /* fitted */ background-color: var(--sklearn-color-fitted-level-2); } /* Specification for estimator info (e.g. "i" and "?") */ /* Common style for "i" and "?" */ .sk-estimator-doc-link, a:link.sk-estimator-doc-link, a:visited.sk-estimator-doc-link { float: right; font-size: smaller; line-height: 1em; font-family: monospace; background-color: var(--sklearn-color-background); border-radius: 1em; height: 1em; width: 1em; text-decoration: none !important; margin-left: 1ex; /* unfitted */ border: var(--sklearn-color-unfitted-level-1) 1pt solid; color: var(--sklearn-color-unfitted-level-1); } .sk-estimator-doc-link.fitted, a:link.sk-estimator-doc-link.fitted, a:visited.sk-estimator-doc-link.fitted { /* fitted */ border: var(--sklearn-color-fitted-level-1) 1pt solid; color: var(--sklearn-color-fitted-level-1); } /* On hover */ div.sk-estimator:hover .sk-estimator-doc-link:hover, .sk-estimator-doc-link:hover, div.sk-label-container:hover .sk-estimator-doc-link:hover, .sk-estimator-doc-link:hover { /* unfitted */ background-color: var(--sklearn-color-unfitted-level-3); color: var(--sklearn-color-background); text-decoration: none; } div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover, .sk-estimator-doc-link.fitted:hover, div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover, .sk-estimator-doc-link.fitted:hover { /* fitted */ background-color: var(--sklearn-color-fitted-level-3); color: var(--sklearn-color-background); text-decoration: none; } /* Span, style for the box shown on hovering the info icon */ .sk-estimator-doc-link span { display: none; z-index: 9999; position: relative; font-weight: normal; right: .2ex; padding: .5ex; margin: .5ex; width: min-content; min-width: 20ex; max-width: 50ex; color: var(--sklearn-color-text); box-shadow: 2pt 2pt 4pt #999; /* unfitted */ background: var(--sklearn-color-unfitted-level-0); border: .5pt solid var(--sklearn-color-unfitted-level-3); } .sk-estimator-doc-link.fitted span { /* fitted */ background: var(--sklearn-color-fitted-level-0); border: var(--sklearn-color-fitted-level-3); } .sk-estimator-doc-link:hover span { display: block; } /* "?"-specific style due to the `` HTML tag */ #sk-container-id-11 a.estimator_doc_link { float: right; font-size: 1rem; line-height: 1em; font-family: monospace; background-color: var(--sklearn-color-background); border-radius: 1rem; height: 1rem; width: 1rem; text-decoration: none; /* unfitted */ color: var(--sklearn-color-unfitted-level-1); border: var(--sklearn-color-unfitted-level-1) 1pt solid; } #sk-container-id-11 a.estimator_doc_link.fitted { /* fitted */ border: var(--sklearn-color-fitted-level-1) 1pt solid; color: var(--sklearn-color-fitted-level-1); } /* On hover */ #sk-container-id-11 a.estimator_doc_link:hover { /* unfitted */ background-color: var(--sklearn-color-unfitted-level-3); color: var(--sklearn-color-background); text-decoration: none; } #sk-container-id-11 a.estimator_doc_link.fitted:hover { /* fitted */ background-color: var(--sklearn-color-fitted-level-3); } </style>
GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['work_year',
                                                                          'salary',
                                                                          'remote_ratio']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(fill_value='missing',
                                                                                                        strategy='constant')),
                                                                                         ('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['experience_level',
                                                                          'employment_type',
                                                                          'job_title',
                                                                          'company_size'])])),
                                       ('regressor',
                                        GradientBoostingRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid={'regressor__learning_rate': [0.01, 0.1],
                         'regressor__n_estimators': [100, 200]},
             scoring='neg_mean_squared_error')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
?Documentation for GridSearchCViFitted
GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['work_year',
                                                                          'salary',
                                                                          'remote_ratio']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(fill_value='missing',
                                                                                                        strategy='constant')),
                                                                                         ('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['experience_level',
                                                                          'employment_type',
                                                                          'job_title',
                                                                          'company_size'])])),
                                       ('regressor',
                                        GradientBoostingRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid={'regressor__learning_rate': [0.01, 0.1],
                         'regressor__n_estimators': [100, 200]},
             scoring='neg_mean_squared_error')
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['work_year', 'salary',
                                                   'remote_ratio']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['experience_level',
                                                   'employment_type',
                                                   'job_title',
                                                   'company_size'])])),
                ('regressor', GradientBoostingRegressor(random_state=42))])
ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['work_year', 'salary', 'remote_ratio']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['experience_level', 'employment_type',
                                  'job_title', 'company_size'])])
['work_year', 'salary', 'remote_ratio']
SimpleImputer(strategy='median')
StandardScaler()
['experience_level', 'employment_type', 'job_title', 'company_size']
SimpleImputer(fill_value='missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore')
GradientBoostingRegressor(random_state=42)

Настрайка гиперпараметров для классификации

In [275]:
param_grid_knn = {
    'classifier__n_neighbors': [3, 5, 7]
}

# Настройка для KNN
grid_search_knn = GridSearchCV(pipelines_clf["KNN"], param_grid_knn, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_knn.fit(X_train_clf, y_train_clf)
Out[275]:
<style>#sk-container-id-12 { /* Definition of color scheme common for light and dark mode */ --sklearn-color-text: black; --sklearn-color-line: gray; /* Definition of color scheme for unfitted estimators */ --sklearn-color-unfitted-level-0: #fff5e6; --sklearn-color-unfitted-level-1: #f6e4d2; --sklearn-color-unfitted-level-2: #ffe0b3; --sklearn-color-unfitted-level-3: chocolate; /* Definition of color scheme for fitted estimators */ --sklearn-color-fitted-level-0: #f0f8ff; --sklearn-color-fitted-level-1: #d4ebff; --sklearn-color-fitted-level-2: #b3dbfd; --sklearn-color-fitted-level-3: cornflowerblue; /* Specific color for light theme */ --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black))); --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white))); --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black))); --sklearn-color-icon: #696969; @media (prefers-color-scheme: dark) { /* Redefinition of color scheme for dark theme */ --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white))); --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111))); --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white))); --sklearn-color-icon: #878787; } } #sk-container-id-12 { color: var(--sklearn-color-text); } #sk-container-id-12 pre { padding: 0; } #sk-container-id-12 input.sk-hidden--visually { border: 0; clip: rect(1px 1px 1px 1px); clip: rect(1px, 1px, 1px, 1px); height: 1px; margin: -1px; overflow: hidden; padding: 0; position: absolute; width: 1px; } #sk-container-id-12 div.sk-dashed-wrapped { border: 1px dashed var(--sklearn-color-line); margin: 0 0.4em 0.5em 0.4em; box-sizing: border-box; padding-bottom: 0.4em; background-color: var(--sklearn-color-background); } #sk-container-id-12 div.sk-container { /* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */ display: inline-block !important; position: relative; } #sk-container-id-12 div.sk-text-repr-fallback { display: none; } div.sk-parallel-item, div.sk-serial, div.sk-item { /* draw centered vertical line to link estimators */ background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background)); background-size: 2px 100%; background-repeat: no-repeat; background-position: center center; } /* Parallel-specific style estimator block */ #sk-container-id-12 div.sk-parallel-item::after { content: ""; width: 100%; border-bottom: 2px solid var(--sklearn-color-text-on-default-background); flex-grow: 1; } #sk-container-id-12 div.sk-parallel { display: flex; align-items: stretch; justify-content: center; background-color: var(--sklearn-color-background); position: relative; } #sk-container-id-12 div.sk-parallel-item { display: flex; flex-direction: column; } #sk-container-id-12 div.sk-parallel-item:first-child::after { align-self: flex-end; width: 50%; } #sk-container-id-12 div.sk-parallel-item:last-child::after { align-self: flex-start; width: 50%; } #sk-container-id-12 div.sk-parallel-item:only-child::after { width: 0; } /* Serial-specific style estimator block */ #sk-container-id-12 div.sk-serial { display: flex; flex-direction: column; align-items: center; background-color: var(--sklearn-color-background); padding-right: 1em; padding-left: 1em; } /* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is clickable and can be expanded/collapsed. - Pipeline and ColumnTransformer use this feature and define the default style - Estimators will overwrite some part of the style using the `sk-estimator` class */ /* Pipeline and ColumnTransformer style (default) */ #sk-container-id-12 div.sk-toggleable { /* Default theme specific background. It is overwritten whether we have a specific estimator or a Pipeline/ColumnTransformer */ background-color: var(--sklearn-color-background); } /* Toggleable label */ #sk-container-id-12 label.sk-toggleable__label { cursor: pointer; display: block; width: 100%; margin-bottom: 0; padding: 0.5em; box-sizing: border-box; text-align: center; } #sk-container-id-12 label.sk-toggleable__label-arrow:before { /* Arrow on the left of the label */ content: "▸"; float: left; margin-right: 0.25em; color: var(--sklearn-color-icon); } #sk-container-id-12 label.sk-toggleable__label-arrow:hover:before { color: var(--sklearn-color-text); } /* Toggleable content - dropdown */ #sk-container-id-12 div.sk-toggleable__content { max-height: 0; max-width: 0; overflow: hidden; text-align: left; /* unfitted */ background-color: var(--sklearn-color-unfitted-level-0); } #sk-container-id-12 div.sk-toggleable__content.fitted { /* fitted */ background-color: var(--sklearn-color-fitted-level-0); } #sk-container-id-12 div.sk-toggleable__content pre { margin: 0.2em; border-radius: 0.25em; color: var(--sklearn-color-text); /* unfitted */ background-color: var(--sklearn-color-unfitted-level-0); } #sk-container-id-12 div.sk-toggleable__content.fitted pre { /* unfitted */ background-color: var(--sklearn-color-fitted-level-0); } #sk-container-id-12 input.sk-toggleable__control:checked~div.sk-toggleable__content { /* Expand drop-down */ max-height: 200px; max-width: 100%; overflow: auto; } #sk-container-id-12 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before { content: "▾"; } /* Pipeline/ColumnTransformer-specific style */ #sk-container-id-12 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label { color: var(--sklearn-color-text); background-color: var(--sklearn-color-unfitted-level-2); } #sk-container-id-12 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label { background-color: var(--sklearn-color-fitted-level-2); } /* Estimator-specific style */ /* Colorize estimator box */ #sk-container-id-12 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label { /* unfitted */ background-color: var(--sklearn-color-unfitted-level-2); } #sk-container-id-12 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label { /* fitted */ background-color: var(--sklearn-color-fitted-level-2); } #sk-container-id-12 div.sk-label label.sk-toggleable__label, #sk-container-id-12 div.sk-label label { /* The background is the default theme color */ color: var(--sklearn-color-text-on-default-background); } /* On hover, darken the color of the background */ #sk-container-id-12 div.sk-label:hover label.sk-toggleable__label { color: var(--sklearn-color-text); background-color: var(--sklearn-color-unfitted-level-2); } /* Label box, darken color on hover, fitted */ #sk-container-id-12 div.sk-label.fitted:hover label.sk-toggleable__label.fitted { color: var(--sklearn-color-text); background-color: var(--sklearn-color-fitted-level-2); } /* Estimator label */ #sk-container-id-12 div.sk-label label { font-family: monospace; font-weight: bold; display: inline-block; line-height: 1.2em; } #sk-container-id-12 div.sk-label-container { text-align: center; } /* Estimator-specific */ #sk-container-id-12 div.sk-estimator { font-family: monospace; border: 1px dotted var(--sklearn-color-border-box); border-radius: 0.25em; box-sizing: border-box; margin-bottom: 0.5em; /* unfitted */ background-color: var(--sklearn-color-unfitted-level-0); } #sk-container-id-12 div.sk-estimator.fitted { /* fitted */ background-color: var(--sklearn-color-fitted-level-0); } /* on hover */ #sk-container-id-12 div.sk-estimator:hover { /* unfitted */ background-color: var(--sklearn-color-unfitted-level-2); } #sk-container-id-12 div.sk-estimator.fitted:hover { /* fitted */ background-color: var(--sklearn-color-fitted-level-2); } /* Specification for estimator info (e.g. "i" and "?") */ /* Common style for "i" and "?" */ .sk-estimator-doc-link, a:link.sk-estimator-doc-link, a:visited.sk-estimator-doc-link { float: right; font-size: smaller; line-height: 1em; font-family: monospace; background-color: var(--sklearn-color-background); border-radius: 1em; height: 1em; width: 1em; text-decoration: none !important; margin-left: 1ex; /* unfitted */ border: var(--sklearn-color-unfitted-level-1) 1pt solid; color: var(--sklearn-color-unfitted-level-1); } .sk-estimator-doc-link.fitted, a:link.sk-estimator-doc-link.fitted, a:visited.sk-estimator-doc-link.fitted { /* fitted */ border: var(--sklearn-color-fitted-level-1) 1pt solid; color: var(--sklearn-color-fitted-level-1); } /* On hover */ div.sk-estimator:hover .sk-estimator-doc-link:hover, .sk-estimator-doc-link:hover, div.sk-label-container:hover .sk-estimator-doc-link:hover, .sk-estimator-doc-link:hover { /* unfitted */ background-color: var(--sklearn-color-unfitted-level-3); color: var(--sklearn-color-background); text-decoration: none; } div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover, .sk-estimator-doc-link.fitted:hover, div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover, .sk-estimator-doc-link.fitted:hover { /* fitted */ background-color: var(--sklearn-color-fitted-level-3); color: var(--sklearn-color-background); text-decoration: none; } /* Span, style for the box shown on hovering the info icon */ .sk-estimator-doc-link span { display: none; z-index: 9999; position: relative; font-weight: normal; right: .2ex; padding: .5ex; margin: .5ex; width: min-content; min-width: 20ex; max-width: 50ex; color: var(--sklearn-color-text); box-shadow: 2pt 2pt 4pt #999; /* unfitted */ background: var(--sklearn-color-unfitted-level-0); border: .5pt solid var(--sklearn-color-unfitted-level-3); } .sk-estimator-doc-link.fitted span { /* fitted */ background: var(--sklearn-color-fitted-level-0); border: var(--sklearn-color-fitted-level-3); } .sk-estimator-doc-link:hover span { display: block; } /* "?"-specific style due to the `` HTML tag */ #sk-container-id-12 a.estimator_doc_link { float: right; font-size: 1rem; line-height: 1em; font-family: monospace; background-color: var(--sklearn-color-background); border-radius: 1rem; height: 1rem; width: 1rem; text-decoration: none; /* unfitted */ color: var(--sklearn-color-unfitted-level-1); border: var(--sklearn-color-unfitted-level-1) 1pt solid; } #sk-container-id-12 a.estimator_doc_link.fitted { /* fitted */ border: var(--sklearn-color-fitted-level-1) 1pt solid; color: var(--sklearn-color-fitted-level-1); } /* On hover */ #sk-container-id-12 a.estimator_doc_link:hover { /* unfitted */ background-color: var(--sklearn-color-unfitted-level-3); color: var(--sklearn-color-background); text-decoration: none; } #sk-container-id-12 a.estimator_doc_link.fitted:hover { /* fitted */ background-color: var(--sklearn-color-fitted-level-3); } </style>
GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['work_year',
                                                                          'salary',
                                                                          'remote_ratio']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(fill_value='missing',
                                                                                                        strategy='constant')),
                                                                                         ('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['experience_level',
                                                                          'employment_type',
                                                                          'job_title',
                                                                          'company_size'])])),
                                       ('classifier', KNeighborsClassifier())]),
             n_jobs=-1, param_grid={'classifier__n_neighbors': [3, 5, 7]},
             scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
?Documentation for GridSearchCViFitted
GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['work_year',
                                                                          'salary',
                                                                          'remote_ratio']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(fill_value='missing',
                                                                                                        strategy='constant')),
                                                                                         ('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['experience_level',
                                                                          'employment_type',
                                                                          'job_title',
                                                                          'company_size'])])),
                                       ('classifier', KNeighborsClassifier())]),
             n_jobs=-1, param_grid={'classifier__n_neighbors': [3, 5, 7]},
             scoring='accuracy')
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['work_year', 'salary',
                                                   'remote_ratio']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['experience_level',
                                                   'employment_type',
                                                   'job_title',
                                                   'company_size'])])),
                ('classifier', KNeighborsClassifier(n_neighbors=3))])
ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['work_year', 'salary', 'remote_ratio']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['experience_level', 'employment_type',
                                  'job_title', 'company_size'])])
['work_year', 'salary', 'remote_ratio']
SimpleImputer(strategy='median')
StandardScaler()
['experience_level', 'employment_type', 'job_title', 'company_size']
SimpleImputer(fill_value='missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore')
KNeighborsClassifier(n_neighbors=3)

Оценка качества моделей для регрессии

In [276]:
print("Оценка качества моделей для регрессии:")
for name, pipeline in pipelines_reg.items():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(f"{name}:")
    print(f"MSE: {mean_squared_error(y_test, y_pred)}")
    print(f"R^2: {r2_score(y_test, y_pred)}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
    print()
Оценка качества моделей для регрессии:
Linear Regression:
MSE: 2999356500.395046
R^2: 0.2402441301538446
MAE: 42627.855980789136

Random Forest:
MSE: 292999814.7487738
R^2: 0.9257813037263503
MAE: 3368.018476697737

Gradient Boosting:
MSE: 357682105.0977604
R^2: 0.9093968726788028
MAE: 4417.4176184728085

Оценка качества моделей для классификации

In [277]:
print("Оценка качества моделей для классификации:")
for name, pipeline in pipelines_clf.items():
    pipeline.fit(X_train_clf, y_train_clf)
    y_pred = pipeline.predict(X_test_clf)
    print(f"{name}:")
    print(f"Accuracy: {accuracy_score(y_test_clf, y_pred)}")
    print(f"Precision: {precision_score(y_test_clf, y_pred, average='weighted')}")
    print(f"Recall: {recall_score(y_test_clf, y_pred, average='weighted')}")
    print(f"F1-Score: {f1_score(y_test_clf, y_pred, average='weighted')}")
    print()
Оценка качества моделей для классификации:
Logistic Regression:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-Score: 1.0

Random Forest:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-Score: 1.0

KNN:
Accuracy: 0.9933422103861518
Precision: 0.9934420772303596
Recall: 0.9933422103861518
F1-Score: 0.9933125276114639

Оценка смещения и дисперсии

In [278]:
print("Оценка смещения и дисперсии для регрессии:")
for name, pipeline in pipelines_reg.items():
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    print(f"{name}: MSE = {-scores.mean()} (+/- {scores.std()})")

print("\nОценка смещения и дисперсии для классификации:")
for name, pipeline in pipelines_clf.items():
    scores = cross_val_score(pipeline, X_train_clf, y_train_clf, cv=5, scoring='accuracy')
    print(f"{name}: Accuracy = {scores.mean()} (+/- {scores.std()})")
Оценка смещения и дисперсии для регрессии:
Linear Regression: MSE = 2950641110.570735 (+/- 218302870.26943365)
Random Forest: MSE = 437418796.4746934 (+/- 233308697.220604)
Gradient Boosting: MSE = 447250778.1629532 (+/- 304985277.2195829)

Оценка смещения и дисперсии для классификации:
Logistic Regression: Accuracy = 1.0 (+/- 0.0)
Random Forest: Accuracy = 1.0 (+/- 0.0)
KNN: Accuracy = 0.9890155296727677 (+/- 0.0016924977064204155)

Сравнение моделей со старыми гиперпараметрами и новыми

In [279]:
# Лучшая модель для Random Forest
best_rf_model = grid_search_rf.best_estimator_
best_rf_params = grid_search_rf.best_params_
best_rf_mse = -grid_search_rf.best_score_  # Меняем знак, так как MSE был отрицательным

# Лучшая модель для Gradient Boosting
best_gb_model = grid_search_gb.best_estimator_
best_gb_params = grid_search_gb.best_params_
best_gb_mse = -grid_search_gb.best_score_  # Меняем знак, так как MSE был отрицательным

# Лучшая модель для KNN
best_knn_model = grid_search_knn.best_estimator_
best_knn_params = grid_search_knn.best_params_
best_knn_accuracy = grid_search_knn.best_score_

# Прогнозирование для тестовой выборки (с новыми гиперпараметрами)
y_pred_rf = best_rf_model.predict(X_test)
y_pred_gb = best_gb_model.predict(X_test)
y_pred_knn = best_knn_model.predict(X_test_clf)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)
    
# Старый RandomForest
old_rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=50, max_depth=5))  # Старые гиперпараметры
])
old_rf_model.fit(X_train, y_train)

# Старый Gradient Boosting
old_gb_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(n_estimators=50, learning_rate=0.1))  # Старые гиперпараметры
])
old_gb_model.fit(X_train, y_train)

# Старый KNN
old_knn_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5))  # Старые гиперпараметры
])
old_knn_model.fit(X_train_clf, y_train_clf)

# Прогнозирование для старых моделей
y_oldpred_rf = old_rf_model.predict(X_test)
y_oldpred_gb = old_gb_model.predict(X_test)
y_oldpred_knn = old_knn_model.predict(X_test_clf)

# Прогнозирование для старых параметров (нужно использовать старые модели)
y_oldpred_rf = old_rf_model.predict(X_test)  # Старое обучение для Random Forest
y_oldpred_gb = old_gb_model.predict(X_test)  # Старое обучение для Gradient Boosting
y_oldpred_knn = old_knn_model.predict(X_test_clf)  # Старое обучение для KNN

# Метрики для сравнения

# MSE для Random Forest
mse_rf_new = mean_squared_error(y_test, y_pred_rf)
mse_rf_old = mean_squared_error(y_test, y_oldpred_rf)

# MSE для Gradient Boosting
mse_gb_new = mean_squared_error(y_test, y_pred_gb)
mse_gb_old = mean_squared_error(y_test, y_oldpred_gb)

# Точность для KNN
accuracy_knn_new = accuracy_score(y_test_clf, y_pred_knn)
accuracy_knn_old = accuracy_score(y_test_clf, y_oldpred_knn)

# Вывод результатов
print(f"Лучшие параметры для Random Forest: {best_rf_params}")
print(f"Лучший MSE для Random Forest: {best_rf_mse}")
print(f"MSE для Random Forest с новыми параметрами: {mse_rf_new}")
print(f"MSE для Random Forest со старыми параметрами: {mse_rf_old}")
print()
print(f"Лучшие параметры для Gradient Boosting: {best_gb_params}")
print(f"Лучший MSE для Gradient Boosting: {best_gb_mse}")
print(f"MSE для Gradient Boosting с новыми параметрами: {mse_gb_new}")
print(f"MSE для Gradient Boosting со старыми параметрами: {mse_gb_old}")
print()
print(f"Лучшие параметры для KNN: {best_knn_params}")
print(f"Точность для KNN с новыми параметрами: {accuracy_knn_new}")
print(f"Точность для KNN со старыми параметрами: {accuracy_knn_old}")
Лучшие параметры для Random Forest: {'regressor__max_depth': 20, 'regressor__n_estimators': 100}
Лучший MSE для Random Forest: 491306262.8668668
MSE для Random Forest с новыми параметрами: 290541051.467423
MSE для Random Forest со старыми параметрами: 264879570.99534488

Лучшие параметры для Gradient Boosting: {'regressor__learning_rate': 0.1, 'regressor__n_estimators': 100}
Лучший MSE для Gradient Boosting: 516767095.68065095
MSE для Gradient Boosting с новыми параметрами: 357682105.0977604
MSE для Gradient Boosting со старыми параметрами: 339496438.89939606

Лучшие параметры для KNN: {'classifier__n_neighbors': 3}
Точность для KNN с новыми параметрами: 0.9933422103861518
Точность для KNN со старыми параметрами: 0.9933422103861518

Визуализация результатов

In [280]:
# График для регрессии
plt.figure(figsize=(10, 6))
plt.scatter(y_test, pipelines_reg["Gradient Boosting"].predict(X_test), alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Реальные значения')
plt.ylabel('Предсказанные значения')
plt.title('Реальные vs Предсказанные значения зарплаты (Gradient Boosting)')
plt.show()

plt.figure(figsize=(10, 5))

# График для Random Forest
plt.figure(figsize=(10, 5))
plt.scatter(range(len(y_test)), y_test, label="Актуальные значения", color="black", alpha=0.5, s=10)
plt.scatter(range(len(y_test)), y_pred_rf, label="Предсказанные (Random Forest, новые параметры)", color="blue", alpha=0.5, s=10)
plt.scatter(range(len(y_test)), y_oldpred_rf, label="Предсказанные (Random Forest, старые параметры)", color="red", alpha=0.5, s=10)
plt.xlabel("Выборка")
plt.ylabel("Значения")
plt.legend()
plt.title("Random Forest: Актуальные vs Предсказанные значения")
plt.show()

# График для Gradient Boosting
plt.figure(figsize=(10, 5))
plt.scatter(range(len(y_test)), y_test, label="Актуальные значения", color="black", alpha=0.5, s=10)
plt.scatter(range(len(y_test)), y_pred_gb, label="Предсказанные (Gradient Boosting, новые параметры)", color="green", alpha=0.5, s=10)
plt.scatter(range(len(y_test)), y_oldpred_gb, label="Предсказанные (Gradient Boosting, старые параметры)", color="orange", alpha=0.5, s=10)
plt.xlabel("Выборка")
plt.ylabel("Значения")
plt.legend()
plt.title("Gradient Boosting: Актуальные vs Предсказанные значения")
plt.show()

# График для KNN
plt.figure(figsize=(10, 5))
plt.scatter(range(len(y_test_clf)), y_test_clf, label="Актуальные значения", color="black", alpha=0.5, s=10)
plt.scatter(range(len(y_test_clf)), y_pred_knn, label="Предсказанные (KNN, новые параметры)", color="purple", alpha=0.5, s=10)
plt.scatter(range(len(y_test_clf)), y_oldpred_knn, label="Предсказанные (KNN, старые параметры)", color="yellow", alpha=0.5, s=10)
plt.xlabel("Выборка")
plt.ylabel("Значения")
plt.legend()
plt.title("KNN: Актуальные vs Предсказанные значения")
plt.show()
No description has been provided for this image
<Figure size 1000x500 with 0 Axes>
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image