61 lines
2.3 KiB
Python
61 lines
2.3 KiB
Python
import pandas as pd
|
||
from sklearn.model_selection import train_test_split
|
||
from sklearn.linear_model import Lasso
|
||
from sklearn.metrics import mean_squared_error
|
||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||
from sklearn.compose import ColumnTransformer
|
||
from sklearn.pipeline import Pipeline
|
||
import matplotlib.pyplot as plt
|
||
|
||
# Train a Lasso regression that predicts `salary_in_usd` from job/company
# attributes, report its quality on a held-out split and plot the learned
# coefficients.

# Load the data
file_path = 'ds_salaries.csv'
data = pd.read_csv(file_path)

# Preprocessing: standardize the numeric column, one-hot encode the categoricals
categorical_features = ['experience_level', 'employment_type', 'company_location', 'company_size']
numeric_features = ['work_year']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # handle_unknown='ignore' keeps predict() from failing on categories
        # that appear in the test split but not in the training split.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ])

# Feature selection: derived from the two lists above so the column set
# handed to the model cannot drift from what the preprocessor expects.
features = numeric_features + categorical_features
X = data[features]
y = data['salary_in_usd']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and fit the model together with the preprocessor
alpha = 0.01
lasso_model = Pipeline([
    ('preprocessor', preprocessor),
    ('lasso', Lasso(alpha=alpha)),
])

lasso_model.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = lasso_model.predict(X_test)

# Model quality. NOTE: despite its name, `accuracy` holds the R^2 score
# (it is printed as such below), not a classification accuracy.
accuracy = lasso_model.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)

print(f"R^2 Score: {accuracy:.2f}")
print(f"Mean Squared Error: {mse:.2f}")

# Show predicted vs. actual values
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(predictions_df)

# Visualize the model weights (coefficients).
# Look the fitted encoder up by its transformer name rather than by position,
# so reordering the ColumnTransformer entries cannot silently pick the scaler.
encoder = lasso_model.named_steps['preprocessor'].named_transformers_['cat']
# OneHotEncoder.get_feature_names was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out and fall back only on old
# installations that do not have it yet.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_names = encoder.get_feature_names_out(categorical_features)
else:
    encoded_names = encoder.get_feature_names(categorical_features)

# Index order mirrors the transformer order above: numeric columns first,
# then the one-hot columns.
coefficients = pd.Series(
    lasso_model.named_steps['lasso'].coef_,
    index=numeric_features + list(encoded_names),
)
plt.figure(figsize=(10, 6))
coefficients.sort_values().plot(kind='barh')
plt.title('Lasso Regression Coefficients')
plt.show()
|