AIM-PIbd-32-Kaznacheeva-E-K/lab_4/Lab4.ipynb

Start of the lab work

In [8]:
import pandas as pd
df = pd.read_csv("..//static//csv//ds_salaries.csv")
print(df.columns)
Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')
In [10]:
df.head()
Out[10]:
work_year experience_level employment_type job_title salary salary_currency salary_in_usd employee_residence remote_ratio company_location company_size
0 2023 SE FT Principal Data Scientist 80000 EUR 85847 ES 100 ES L
1 2023 MI CT ML Engineer 30000 USD 30000 US 100 US S
2 2023 MI CT ML Engineer 25500 USD 25500 US 100 US S
3 2023 SE FT Data Scientist 175000 USD 175000 CA 100 CA M
4 2023 SE FT Data Scientist 120000 USD 120000 CA 100 CA M
In [9]:
df.describe()
Out[9]:
work_year salary salary_in_usd remote_ratio
count 3755.000000 3.755000e+03 3755.000000 3755.000000
mean 2022.373635 1.906956e+05 137570.389880 46.271638
std 0.691448 6.716765e+05 63055.625278 48.589050
min 2020.000000 6.000000e+03 5132.000000 0.000000
25% 2022.000000 1.000000e+05 95000.000000 0.000000
50% 2022.000000 1.380000e+05 135000.000000 0.000000
75% 2023.000000 1.800000e+05 175000.000000 100.000000
max 2023.000000 3.040000e+07 450000.000000 100.000000
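Note the gap between the maximum of `salary` (3.04e7, raw currency units) and of `salary_in_usd` (450000): a few raw-currency salaries are extreme outliers. A quick hedged check, assuming the `df` loaded above:

# Inspect the rows with the largest raw-currency salaries
print(df.nlargest(5, 'salary')[['salary', 'salary_currency', 'salary_in_usd']])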
In [11]:
# Percentage of missing values per feature
for i in df.columns:
    null_rate = df[i].isnull().sum() / len(df) * 100
    if null_rate > 0:
        print(f'{i}: {null_rate:.2f}% missing')

print(df.isnull().sum())

print(df.isnull().any())
work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64
work_year             False
experience_level      False
employment_type       False
job_title             False
salary                False
salary_currency       False
salary_in_usd         False
employee_residence    False
remote_ratio          False
company_location      False
company_size          False
dtype: bool

Classification

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Create the target feature (1 if salary_in_usd is above the median, else 0)
median_salary = df['salary_in_usd'].median()
df['above_median_salary'] = np.where(df['salary_in_usd'] > median_salary, 1, 0)

# Split into features and target
X = df.drop(columns=['salary_in_usd', 'above_median_salary'])
y = df['above_median_salary']

# Rough categorization of salaries into three bands
df['salary_category'] = pd.cut(df['salary_in_usd'], bins=[0, 100000, 200000, np.inf], labels=[0, 1, 2])

# Select features and target variables
X = df.drop(columns=['salary_in_usd', 'salary_category'])

def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
   
    if not np.isclose(frac_train + frac_val + frac_test, 1.0):
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )
    
    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))
    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.
    
    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    if frac_val <= 0:
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp
    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)

    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df, stratify_colname="above_median_salary", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=42
)

display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)

# Check dtypes after the transformation
print(df.dtypes)

# Visualize the salary distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['salary_in_usd'], bins=50, kde=True)
plt.title('Salary distribution')
plt.xlabel('Salary (USD)')
plt.ylabel('Frequency')
plt.show()

# Visualize salary vs. experience level
plt.figure(figsize=(10, 6))
sns.boxplot(x='experience_level', y='salary_in_usd', data=df)
plt.title('Salary vs. experience level')
plt.xlabel('Experience level')
plt.ylabel('Salary (USD)')
plt.show()
'X_train'
work_year experience_level employment_type job_title salary salary_currency salary_in_usd employee_residence remote_ratio company_location company_size above_median_salary salary_category
1809 2023 SE FT Data Engineer 182000 USD 182000 US 100 US M 1 1
1082 2023 SE FT Machine Learning Engineer 126000 USD 126000 US 0 US M 0 1
1686 2023 SE FT BI Developer 140000 USD 140000 US 100 US M 1 1
1600 2023 SE FT Data Scientist 140000 USD 140000 US 0 US M 1 1
1376 2023 SE FT Data Engineer 226700 USD 226700 US 0 US M 1 2
... ... ... ... ... ... ... ... ... ... ... ... ... ...
2706 2022 SE FT Data Engineer 160000 USD 160000 US 100 US M 1 1
928 2023 MI FT Data Engineer 200000 USD 200000 US 0 US M 1 1
564 2023 MI FT Data Engineer 140000 USD 140000 US 0 US M 1 1
716 2023 SE FT Data Scientist 297300 USD 297300 US 100 US M 1 2
1299 2023 SE FT Data Engineer 133832 USD 133832 US 0 US M 0 1

3004 rows × 13 columns

'y_train'
above_median_salary
1809 1
1082 0
1686 1
1600 1
1376 1
... ...
2706 1
928 1
564 1
716 1
1299 0

3004 rows × 1 columns

'X_test'
work_year experience_level employment_type job_title salary salary_currency salary_in_usd employee_residence remote_ratio company_location company_size above_median_salary salary_category
3459 2022 MI FT Research Scientist 59000 EUR 61989 AT 0 AT L 0 0
3724 2021 EN FT Business Data Analyst 50000 EUR 59102 LU 100 LU L 0 0
1795 2023 SE FT Data Engineer 180000 USD 180000 US 0 US M 1 1
3535 2021 MI FT Data Scientist 50000 USD 50000 NG 100 NG L 0 0
3255 2022 MI FT Data Analyst 106260 USD 106260 US 0 US M 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1943 2022 MI FT Data Engineer 120000 USD 120000 US 100 US M 0 1
573 2023 EN FT Autonomous Vehicle Technician 7000 USD 7000 GH 0 GH S 0 0
3013 2022 SE FT Machine Learning Engineer 129300 USD 129300 US 0 US M 0 1
327 2023 EN FT Data Scientist 70000 CAD 51753 CA 100 CA L 0 0
1565 2023 SE FT Data Analyst 48000 EUR 51508 ES 0 ES M 0 0

751 rows × 13 columns

'y_test'
above_median_salary
3459 0
3724 0
1795 1
3535 0
3255 0
... ...
1943 0
573 0
3013 0
327 0
1565 0

751 rows × 1 columns

work_year                 int64
experience_level         object
employment_type          object
job_title                object
salary                    int64
salary_currency          object
salary_in_usd             int64
employee_residence       object
remote_ratio              int64
company_location         object
company_size             object
above_median_salary       int64
salary_category        category
dtype: object
[Figure: histogram of the salary_in_usd distribution]
[Figure: boxplot of salary_in_usd by experience_level]

Now we move on to splitting the data and creating a baseline

In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Load the data
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Create the target feature
median_salary = df['salary_in_usd'].median()
df['above_median_salary'] = np.where(df['salary_in_usd'] > median_salary, 1, 0)

# Features and target
features = ['work_year', 'experience_level', 'employment_type', 'job_title', 'salary', 'salary_currency', 'remote_ratio', 'employee_residence', 'company_location', 'company_size']
target = 'above_median_salary'

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42, stratify=df[target])

print("Размер обучающей выборки:", X_train.shape)
print("Размер тестовой выборки:", X_test.shape)

# Baseline: always predict the majority class of the training target
majority_class = int(y_train.mean() > 0.5)
baseline_predictions = [majority_class] * len(y_test)

# Baseline metrics
baseline_accuracy = accuracy_score(y_test, baseline_predictions)
baseline_f1 = f1_score(y_test, baseline_predictions, average='weighted')

print('Baseline Accuracy:', baseline_accuracy)
print('Baseline F1 Score:', baseline_f1)
Training set size: (3004, 10)
Test set size: (751, 10)
Baseline Accuracy: 0.5126498002663116
Baseline F1 Score: 0.3474826991241725
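The hand-rolled baseline above is equivalent to a majority-class predictor; scikit-learn ships this as DummyClassifier. A minimal sketch, assuming X_train, X_test, y_train, y_test from the cell above:

# Same baseline via sklearn's DummyClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score

dummy = DummyClassifier(strategy="most_frequent")  # always predicts the majority class
dummy.fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)
print("Dummy Accuracy:", accuracy_score(y_test, dummy_pred))
print("Dummy F1 Score:", f1_score(y_test, dummy_pred, average="weighted"))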

Building the pipelines and training the models

In [ ]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score

# Load the data
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Create the target feature
median_salary = df['salary_in_usd'].median()
df['above_median_salary'] = np.where(df['salary_in_usd'] > median_salary, 1, 0)

# Features and target
X = df.drop(columns=['salary_in_usd', 'above_median_salary'])
y = df['above_median_salary']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Column groups
numeric_columns = ["work_year", "salary", "remote_ratio"]
cat_columns = ["experience_level", "employment_type", "job_title", "salary_currency", "employee_residence", "company_location", "company_size"]

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns)])

# One pipeline per model
pipeline_logistic_regression = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42))])

pipeline_decision_tree = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))])

pipeline_gradient_boosting = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42))])

# List of pipelines
pipelines = [
    ('Logistic Regression', pipeline_logistic_regression),
    ('Decision Tree', pipeline_decision_tree),
    ('Gradient Boosting', pipeline_gradient_boosting)
]

# Train the models and report test metrics
for name, pipeline in pipelines:
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("-" * 40)
Model: Logistic Regression
Accuracy: 0.7523
F1 Score: 0.7609
----------------------------------------
Model: Decision Tree
Accuracy: 0.9960
F1 Score: 0.9959
----------------------------------------
Model: Gradient Boosting
Accuracy: 0.9947
F1 Score: 0.9945
----------------------------------------

Evaluating model quality

In [27]:
from sklearn.metrics import accuracy_score, f1_score

for name, pipeline in pipelines:
    y_pred = pipeline.predict(X_test)
    print(f"Model: {name}")
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('F1 Score:', f1_score(y_test, y_pred, average='weighted'))
    print()
Model: Logistic Regression
Accuracy: 0.7523302263648469
F1 Score: 0.7517841210039291

Model: Decision Tree
Accuracy: 0.996005326231691
F1 Score: 0.9960048583691977

Model: Gradient Boosting
Accuracy: 0.9946737683089214
F1 Score: 0.9946728986768623
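The near-perfect tree and boosting scores deserve a caveat: the feature set still includes `salary` and `salary_currency`, which together almost determine `salary_in_usd` and hence the target, so the label largely leaks into the inputs. A confusion matrix shows where the remaining errors fall; a minimal sketch, assuming the fitted `pipelines` from the cells above:

from sklearn.metrics import confusion_matrix

for name, pipeline in pipelines:
    y_pred = pipeline.predict(X_test)
    # rows = true class (0/1), columns = predicted class (0/1)
    print(f"Model: {name}")
    print(confusion_matrix(y_test, y_pred))
    print()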

Regression. Goal: build a regression model that predicts salary (salary_in_usd) from demographic data, job characteristics, and other factors.

In [28]:
import pandas as pd
from scipy import stats

# Load the data
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Numeric features
numeric_features = ['work_year', 'salary', 'salary_in_usd', 'remote_ratio']

# z-scores for the numeric features
z_scores = stats.zscore(df[numeric_features])

# Threshold for outlier removal
threshold = 3

# Drop rows where any z-score exceeds the threshold
# (one-sided: np.abs(z_scores) would also trim low outliers)
df_cleaned = df[(z_scores < threshold).all(axis=1)]

print("Размер данных до удаления выбросов:", df.shape)
print("Размер данных после удаления выбросов:", df_cleaned.shape)
Data size before outlier removal: (3755, 11)
Data size after outlier removal: (3708, 11)
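As the comment notes, the filter above only removes high outliers. A two-sided variant is a one-line change; a minimal sketch, assuming the same df, z_scores, and threshold:

import numpy as np

# Keep only rows whose numeric features all lie within ±threshold standard deviations
df_cleaned_two_sided = df[(np.abs(z_scores) < threshold).all(axis=1)]
print("Two-sided cleaning:", df_cleaned_two_sided.shape)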
In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Features and target
features = ['work_year', 'experience_level', 'employment_type', 'job_title', 'salary_currency', 'remote_ratio', 'employee_residence', 'company_location', 'company_size']
target = 'salary_in_usd'

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(df_cleaned[features], df_cleaned[target], test_size=0.2, random_state=42)

print("Размер обучающей выборки:", X_train.shape)
print("Размер тестовой выборки:", X_test.shape)

# Baseline: always predict the mean of the training target
baseline_predictions = [y_train.mean()] * len(y_test)

# Baseline metrics
print('Baseline MAE:', mean_absolute_error(y_test, baseline_predictions))
print('Baseline MSE:', mean_squared_error(y_test, baseline_predictions))
print('Baseline R²:', r2_score(y_test, baseline_predictions))
Training set size: (2966, 9)
Test set size: (742, 9)
Baseline MAE: 48988.97819674187
Baseline MSE: 3791583837.2779293
Baseline R²: -0.005051587587466155
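The same baseline can be expressed with scikit-learn's DummyRegressor; a minimal sketch, assuming the split above:

from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

dummy = DummyRegressor(strategy="mean")  # always predicts the mean of y_train
dummy.fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)
print("Dummy MAE:", mean_absolute_error(y_test, dummy_pred))
print("Dummy MSE:", mean_squared_error(y_test, dummy_pred))
print("Dummy R²:", r2_score(y_test, dummy_pred))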
In [31]:
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the data
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Numeric features
numeric_features = ['work_year', 'salary_in_usd', 'remote_ratio']

# z-scores for the numeric features
z_scores = stats.zscore(df[numeric_features])

# Threshold for outlier removal
threshold = 3

# Drop rows where any z-score exceeds the threshold (one-sided, as above)
df_cleaned = df[(z_scores < threshold).all(axis=1)]

print("Размер данных до удаления выбросов:", df.shape)
print("Размер данных после удаления выбросов:", df_cleaned.shape)

# Split into train/test and create the baseline
features = ['work_year', 'experience_level', 'employment_type', 'job_title', 'salary_currency', 'remote_ratio', 'employee_residence', 'company_location', 'company_size']
target = 'salary_in_usd'

X_train, X_test, y_train, y_test = train_test_split(df_cleaned[features], df_cleaned[target], test_size=0.2, random_state=42)

print("Размер обучающей выборки:", X_train.shape)
print("Размер тестовой выборки:", X_test.shape)

# Baseline: always predict the mean of the training target
baseline_predictions = [y_train.mean()] * len(y_test)

print('Baseline MAE:', mean_absolute_error(y_test, baseline_predictions))
print('Baseline MSE:', mean_squared_error(y_test, baseline_predictions))
print('Baseline R²:', r2_score(y_test, baseline_predictions))

# Build the pipelines and train the models
categorical_features = ['experience_level', 'employment_type', 'job_title', 'salary_currency', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

pipeline_linear_regression = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())])

pipeline_decision_tree = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))])

pipeline_gradient_boosting = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))])

pipelines = [
    ('Linear Regression', pipeline_linear_regression),
    ('Decision Tree', pipeline_decision_tree),
    ('Gradient Boosting', pipeline_gradient_boosting)
]

for name, pipeline in pipelines:
    pipeline.fit(X_train, y_train)
    print(f"Model: {name} trained.")

# Evaluate the models
for name, pipeline in pipelines:
    y_pred = pipeline.predict(X_test)
    print(f"Model: {name}")
    print('MAE:', mean_absolute_error(y_test, y_pred))
    print('MSE:', mean_squared_error(y_test, y_pred))
    print('R²:', r2_score(y_test, y_pred))
    print()
Data size before outlier removal: (3755, 11)
Data size after outlier removal: (3733, 11)
Training set size: (2986, 9)
Test set size: (747, 9)
Baseline MAE: 47593.92288600708
Baseline MSE: 3680965527.9964128
Baseline R²: -0.0016576422593919116
Model: Linear Regression trained.
Model: Decision Tree trained.
Model: Gradient Boosting trained.
Model: Linear Regression
MAE: 36617.65439873256
MSE: 2194684192.4416404
R²: 0.4027865306031213

Model: Decision Tree
MAE: 36516.71804922624
MSE: 2246643776.062331
R²: 0.38864738324451775

Model: Gradient Boosting
MAE: 35842.80843437428
MSE: 2125285552.2470944
R²: 0.42167116230764956

In [32]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

for name, pipeline in pipelines:
    y_pred = pipeline.predict(X_test)
    print(f"Model: {name}")
    print('MAE:', mean_absolute_error(y_test, y_pred))
    print('MSE:', mean_squared_error(y_test, y_pred))
    print('R²:', r2_score(y_test, y_pred))
    print()
Model: Linear Regression
MAE: 36617.65439873256
MSE: 2194684192.4416404
R²: 0.4027865306031213

Model: Decision Tree
MAE: 36516.71804922624
MSE: 2246643776.062331
R²: 0.38864738324451775

Model: Gradient Boosting
MAE: 35842.80843437428
MSE: 2125285552.2470944
R²: 0.42167116230764956
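MSE values in the billions are hard to read at a glance; root mean squared error is in the same units as the salary itself. A minimal sketch, assuming the fitted pipelines above:

import numpy as np
from sklearn.metrics import mean_squared_error

for name, pipeline in pipelines:
    y_pred = pipeline.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # same units as salary_in_usd
    print(f"Model: {name}, RMSE: {rmse:.0f} USD")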

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the data
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Check for missing values
print("Missing values:\n", df.isnull().sum())

# Drop any rows with missing values
df = df.dropna()

# Features and target
features = ['work_year', 'experience_level', 'employment_type', 'job_title', 'employee_residence', 'remote_ratio', 'company_location', 'company_size']
target = 'salary_in_usd'

# Categorical and numeric features
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']

# Preprocessing pipelines
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Transform the features (note: OneHotEncoder yields a sparse matrix here)
X = preprocessor.fit_transform(df[features])
y = df[target]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
Missing values:
 work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64
In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from scipy.stats import uniform, randint

# Load the data
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# ... (data preprocessing code as in the previous cell) ...

# Hyperparameter distributions to sample from
param_distributions = {
  'Linear Regression': {
    'regressor__fit_intercept': [True, False],
    'regressor__positive': [True, False]
  },
  'Random Forest': {
    'regressor__n_estimators': randint(50, 200),
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': randint(2, 11),
    'regressor__min_samples_leaf': randint(1, 5),
    'regressor__bootstrap': [True, False]
  },
  'Gradient Boosting': {
    'regressor__n_estimators': randint(50, 200),
    'regressor__learning_rate': uniform(0.01, 0.49), # uniform distribution for learning rate
    'regressor__max_depth': [3, 5, 7],
    'regressor__min_samples_split': randint(2, 11),
    'regressor__min_samples_leaf': randint(1, 5),
    'regressor__subsample': uniform(0.5, 0.5) # uniform distribution for subsample

  }
}

# Best hyperparameters per model
best_models = {}

# Fit a randomized search for each model
for model_name, model_params in param_distributions.items():
  if model_name == 'Linear Regression':
    model = LinearRegression()
  elif model_name == 'Random Forest':
    model = RandomForestRegressor(random_state=42)
  elif model_name == 'Gradient Boosting':
    model = GradientBoostingRegressor(random_state=42)
  else:
    continue  # skip unknown model names

  pipeline = Pipeline([('regressor', model)])
  random_search = RandomizedSearchCV(pipeline, param_distributions=model_params, n_iter=10, cv=3, n_jobs=-1, random_state=42)
  random_search.fit(X_train, y_train)
  best_models[model_name] = random_search.best_params_


# Visualize the best hyperparameters

fig, axes = plt.subplots(len(best_models), 1, figsize=(10, 5 * len(best_models)))
if len(best_models) == 1:
  axes = [axes]  # handle the single-model case

for i, (model_name, params) in enumerate(best_models.items()):
  axes[i].bar(params.keys(), params.values())
  axes[i].set_title(f"Лучшие гиперпараметры для {model_name}")
  axes[i].set_xticklabels(params.keys(), rotation=45, ha="right")  # rotate the x-axis labels
  axes[i].tick_params(axis='x', which='major', labelsize=8)  # x tick label font size

plt.tight_layout()
plt.show()
d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\model_selection\_search.py:320: UserWarning: The total space of parameters 4 is smaller than n_iter=10. Running 4 iterations. For exhaustive searches, use GridSearchCV.
  warnings.warn(
d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\model_selection\_validation.py:540: FitFailedWarning: 
6 fits failed out of a total of 12.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
TypeError: Sparse data was passed for X, but dense data is required. Use '.toarray()' to convert to a dense numpy array.

  warnings.warn(some_fits_failed_message, FitFailedWarning)
d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\model_selection\_search.py:1103: UserWarning: One or more of the test scores are non-finite: [       nan 0.37308723        nan 0.37316524]
  warnings.warn(
C:\Users\user\AppData\Local\Temp\ipykernel_14908\2948510432.py:70: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axes[i].set_xticklabels(params.keys(), rotation=45, ha="right")  # rotate the x-axis labels
[Figure: bar charts of the best hyperparameters for each model]
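The failed fits come from LinearRegression(positive=True), which cannot handle the sparse matrix produced by OneHotEncoder. A hedged fix, assuming the X and y from the preprocessing cell above: densify the matrix before splitting.

from scipy import sparse

# Densify the one-hot-encoded matrix so that LinearRegression(positive=True)
# can fit; for wide one-hot matrices this trades memory for compatibility.
if sparse.issparse(X):
    X = X.toarray()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Similarly, calling axes[i].set_xticks(range(len(params))) before set_xticklabels would silence the tick-label warning in the plotting code.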