IIS_2023_1/degtyarev_mikhail_lab_5/main.py

61 lines
2.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
# Загрузка данных
file_path = 'ds_salaries.csv'
data = pd.read_csv(file_path)
# Предварительная обработка данных
categorical_features = ['experience_level', 'employment_type', 'company_location', 'company_size']
numeric_features = ['work_year']
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numeric_features),
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])
# Выбор признаков
features = ['work_year', 'experience_level', 'employment_type', 'company_location', 'company_size']
X = data[features]
y = data['salary_in_usd']
# Разделение данных на обучающий и тестовый наборы
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Создание и обучение модели с использованием предварительного обработчика данных
alpha = 0.01
lasso_model = Pipeline([
('preprocessor', preprocessor),
('lasso', Lasso(alpha=alpha))
])
lasso_model.fit(X_train, y_train)
# Получение прогнозов
y_pred = lasso_model.predict(X_test)
# Оценка точности модели
accuracy = lasso_model.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)
print(f"R^2 Score: {accuracy:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
# Вывод предсказанных и фактических значений
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(predictions_df)
# Визуализация весов (коэффициентов) модели
coefficients = pd.Series(lasso_model.named_steps['lasso'].coef_, index=numeric_features + list(lasso_model.named_steps['preprocessor'].transformers_[1][1].get_feature_names(categorical_features)))
plt.figure(figsize=(10, 6))
coefficients.sort_values().plot(kind='barh')
plt.title('Lasso Regression Coefficients')
plt.show()