Start of the lab work
In [8]:
import pandas as pd
df = pd.read_csv("../static/csv/ds_salaries.csv")
print(df.columns)
In [10]:
df.head()
Out[10]:
In [9]:
df.describe()
Out[9]:
In [11]:
# Percentage of missing values per feature
for i in df.columns:
    null_rate = df[i].isnull().sum() / len(df) * 100
    if null_rate > 0:
        print(f'{i}: {null_rate:.2f}% missing values')

print(df.isnull().sum())
print(df.isnull().any())
Classification. Goal: build a classification model that predicts whether a salary is above the median (the binary feature above_median_salary) from job and employer attributes.
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple

import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_csv("../static/csv/ds_salaries.csv")

# Create the target feature
median_salary = df['salary_in_usd'].median()
df['above_median_salary'] = np.where(df['salary_in_usd'] > median_salary, 1, 0)

# Split into features and target variable
X = df.drop(columns=['salary_in_usd', 'above_median_salary'])
y = df['above_median_salary']

# Rough categorization of salaries into three bands
df['salary_category'] = pd.cut(df['salary_in_usd'], bins=[0, 100000, 200000, np.inf], labels=[0, 1, 2])

# Select features (drop the target and everything derived from it)
X = df.drop(columns=['salary_in_usd', 'salary_category', 'above_median_salary'])

def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )
    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    if frac_val <= 0:
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df, stratify_colname="above_median_salary", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=42
)

display("X_train", X_train)
display("y_train", y_train)
display("X_test", X_test)
display("y_test", y_test)

# Check the resulting dtypes
print(df.dtypes)

# Visualize the salary distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['salary_in_usd'], bins=50, kde=True)
plt.title('Salary distribution')
plt.xlabel('Salary (USD)')
plt.ylabel('Frequency')
plt.show()

# Visualize the relationship between salary and experience level
plt.figure(figsize=(10, 6))
sns.boxplot(x='experience_level', y='salary_in_usd', data=df)
plt.title('Salary vs. experience level')
plt.xlabel('Experience level')
plt.ylabel('Salary (USD)')
plt.show()
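Since the target is defined by a median split, the two classes should be close to 50/50, which is what makes plain accuracy a reasonable headline metric here. A quick sanity check (not part of the original notebook) confirms the balance:

In [ ]:
# Class balance check: a median split should give roughly equal class frequencies
print(df['above_median_salary'].value_counts(normalize=True))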
Now let's move on to splitting the data and creating a baseline.
In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Load the data
df = pd.read_csv("../static/csv/ds_salaries.csv")

# Create the target feature
median_salary = df['salary_in_usd'].median()
df['above_median_salary'] = np.where(df['salary_in_usd'] > median_salary, 1, 0)

# Split into features and target variable
features = ['work_year', 'experience_level', 'employment_type', 'job_title', 'salary', 'salary_currency', 'remote_ratio', 'employee_residence', 'company_location', 'company_size']
target = 'above_median_salary'

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42, stratify=df[target])

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

# Create a baseline: always predict the majority class of the training set
majority_class = y_train.mode()[0]
baseline_predictions = [majority_class] * len(y_test)

# Compute baseline metrics
baseline_accuracy = accuracy_score(y_test, baseline_predictions)
baseline_f1 = f1_score(y_test, baseline_predictions, average='weighted')

print('Baseline Accuracy:', baseline_accuracy)
print('Baseline F1 Score:', baseline_f1)
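The same majority-class baseline can be expressed through scikit-learn's DummyClassifier, which hides the logic behind the usual fit/predict interface. A minimal equivalent sketch (not in the original notebook):

In [ ]:
from sklearn.dummy import DummyClassifier

# Equivalent baseline: always predict the most frequent training class
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train)
print('DummyClassifier Accuracy:', dummy.score(X_test, y_test))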
Building the pipelines and training the models
In [ ]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score

# Load the data
df = pd.read_csv("../static/csv/ds_salaries.csv")

# Create the target feature
median_salary = df['salary_in_usd'].median()
df['above_median_salary'] = np.where(df['salary_in_usd'] > median_salary, 1, 0)

# Split into features and target variable
X = df.drop(columns=['salary_in_usd', 'above_median_salary'])
y = df['above_median_salary']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define column groups
numeric_columns = ["work_year", "salary", "remote_ratio"]
cat_columns = ["experience_level", "employment_type", "job_title", "salary_currency", "employee_residence", "company_location", "company_size"]

# Preprocessing: scale numeric columns, one-hot encode categorical ones
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns)])

# One pipeline per model
pipeline_logistic_regression = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42))])

pipeline_decision_tree = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))])

pipeline_gradient_boosting = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42))])

# List of pipelines
pipelines = [
    ('Logistic Regression', pipeline_logistic_regression),
    ('Decision Tree', pipeline_decision_tree),
    ('Gradient Boosting', pipeline_gradient_boosting)
]

# Train the models and report results
for name, pipeline in pipelines:
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("-" * 40)
Evaluating model quality
In [27]:
from sklearn.metrics import accuracy_score, f1_score

for name, pipeline in pipelines:
    y_pred = pipeline.predict(X_test)
    print(f"Model: {name}")
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('F1 Score:', f1_score(y_test, y_pred, average='weighted'))
    print()
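Accuracy and F1 compress a lot of information into one number; per-class precision and recall plus the confusion matrix show where the errors actually land. A possible addition (assumed, not in the original notebook), applied here to the last model evaluated in the loop above:

In [ ]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision/recall and error counts for the last evaluated model
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))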
Regression. Goal: build a regression model that predicts salary (salary_in_usd) from demographic data, job type, and other factors.
In [28]:
import pandas as pd
from scipy import stats

# Load the data
df = pd.read_csv("../static/csv/ds_salaries.csv")

# Numeric features
numeric_features = ['work_year', 'salary', 'salary_in_usd', 'remote_ratio']

# Compute z-scores for the numeric features
z_scores = stats.zscore(df[numeric_features])

# Threshold for outlier removal
threshold = 3

# Remove outliers (both tails, hence the absolute value)
df_cleaned = df[(abs(z_scores) < threshold).all(axis=1)]

print("Data size before outlier removal:", df.shape)
print("Data size after outlier removal:", df_cleaned.shape)
In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Features and target variable
features = ['work_year', 'experience_level', 'employment_type', 'job_title', 'salary_currency', 'remote_ratio', 'employee_residence', 'company_location', 'company_size']
target = 'salary_in_usd'

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df_cleaned[features], df_cleaned[target], test_size=0.2, random_state=42)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

# Create a baseline: always predict the training-set mean
baseline_predictions = [y_train.mean()] * len(y_test)

# Compute baseline metrics
print('Baseline MAE:', mean_absolute_error(y_test, baseline_predictions))
print('Baseline MSE:', mean_squared_error(y_test, baseline_predictions))
print('Baseline R²:', r2_score(y_test, baseline_predictions))
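By construction, the mean baseline should score an R² near zero: R² = 1 - SS_res / SS_tot, and predicting the training mean makes the residual sum of squares roughly equal to the total sum of squares on the test set. DummyRegressor expresses the same baseline through the scikit-learn interface; a minimal sketch (not in the original notebook):

In [ ]:
from sklearn.dummy import DummyRegressor

# Equivalent mean baseline via scikit-learn (score() returns R²)
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y_train)
print('DummyRegressor R²:', dummy.score(X_test, y_test))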
In [31]:
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the data
df = pd.read_csv("../static/csv/ds_salaries.csv")

# Numeric features
numeric_features = ['work_year', 'salary_in_usd', 'remote_ratio']

# Compute z-scores for the numeric features
z_scores = stats.zscore(df[numeric_features])

# Threshold for outlier removal
threshold = 3

# Remove outliers (both tails)
df_cleaned = df[(abs(z_scores) < threshold).all(axis=1)]

print("Data size before outlier removal:", df.shape)
print("Data size after outlier removal:", df_cleaned.shape)

# Train/test split and baseline
features = ['work_year', 'experience_level', 'employment_type', 'job_title', 'salary_currency', 'remote_ratio', 'employee_residence', 'company_location', 'company_size']
target = 'salary_in_usd'

X_train, X_test, y_train, y_test = train_test_split(df_cleaned[features], df_cleaned[target], test_size=0.2, random_state=42)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

# Baseline: always predict the training-set mean
baseline_predictions = [y_train.mean()] * len(y_test)
print('Baseline MAE:', mean_absolute_error(y_test, baseline_predictions))
print('Baseline MSE:', mean_squared_error(y_test, baseline_predictions))
print('Baseline R²:', r2_score(y_test, baseline_predictions))

# Build the pipelines and train the models
categorical_features = ['experience_level', 'employment_type', 'job_title', 'salary_currency', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

pipeline_linear_regression = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())])

pipeline_decision_tree = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))])

pipeline_gradient_boosting = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))])

pipelines = [
    ('Linear Regression', pipeline_linear_regression),
    ('Decision Tree', pipeline_decision_tree),
    ('Gradient Boosting', pipeline_gradient_boosting)
]

for name, pipeline in pipelines:
    pipeline.fit(X_train, y_train)
    print(f"Model: {name} trained.")

# Evaluate model quality
for name, pipeline in pipelines:
    y_pred = pipeline.predict(X_test)
    print(f"Model: {name}")
    print('MAE:', mean_absolute_error(y_test, y_pred))
    print('MSE:', mean_squared_error(y_test, y_pred))
    print('R²:', r2_score(y_test, y_pred))
    print()
In [32]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

for name, pipeline in pipelines:
    y_pred = pipeline.predict(X_test)
    print(f"Model: {name}")
    print('MAE:', mean_absolute_error(y_test, y_pred))
    print('MSE:', mean_squared_error(y_test, y_pred))
    print('R²:', r2_score(y_test, y_pred))
    print()
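MSE is measured in squared dollars, which is hard to interpret; RMSE brings the error back to salary units. A small addition (assumed, not in the original lab):

In [ ]:
import numpy as np
from sklearn.metrics import mean_squared_error

# RMSE per model, in the same units as salary_in_usd
for name, pipeline in pipelines:
    y_pred = pipeline.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Model: {name}, RMSE: {rmse:.0f} USD")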
In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import uniform, randint

# Load the data
df = pd.read_csv("../static/csv/ds_salaries.csv")

# Check for missing values
print("Missing values:\n", df.isnull().sum())

# Drop rows with missing values
df = df.dropna()

# Features and target variable
features = ['work_year', 'experience_level', 'employment_type', 'job_title', 'employee_residence', 'remote_ratio', 'company_location', 'company_size']
target = 'salary_in_usd'

# Categorical and numeric features
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']

# Preprocessing pipelines
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Transform the data
X = preprocessor.fit_transform(df[features])
y = df[target]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
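Note that fit_transform runs on the full dataset before the split, so the scaler's statistics see the test rows. With only mean/std scaling and one-hot encoding the effect is small, but the cleaner pattern is to fit the preprocessor on the training portion alone. A sketch of that variant, kept under separate names so it does not clobber the cell above (assumed, not how the original lab does it):

In [ ]:
# Leakage-free variant: split the raw rows first, then fit the preprocessor on train only
Xtr_raw, Xte_raw, ytr, yte = train_test_split(df[features], df[target], test_size=0.2, random_state=42)
Xtr = preprocessor.fit_transform(Xtr_raw)  # scaler/encoder statistics come from training rows only
Xte = preprocessor.transform(Xte_raw)      # test rows are only transformed, never fitted on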
In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from scipy.stats import uniform, randint

# Load the data
df = pd.read_csv("../static/csv/ds_salaries.csv")

# ... (data preprocessing code, as in the previous example) ...

# Hyperparameter distributions
param_distributions = {
    'Linear Regression': {
        'regressor__fit_intercept': [True, False],
        'regressor__positive': [True, False]
    },
    'Random Forest': {
        'regressor__n_estimators': randint(50, 200),
        'regressor__max_depth': [None, 10, 20],
        'regressor__min_samples_split': randint(2, 11),
        'regressor__min_samples_leaf': randint(1, 5),
        'regressor__bootstrap': [True, False]
    },
    'Gradient Boosting': {
        'regressor__n_estimators': randint(50, 200),
        'regressor__learning_rate': uniform(0.01, 0.49),  # uniform over [0.01, 0.50]
        'regressor__max_depth': [3, 5, 7],
        'regressor__min_samples_split': randint(2, 11),
        'regressor__min_samples_leaf': randint(1, 5),
        'regressor__subsample': uniform(0.5, 0.5)  # uniform over [0.5, 1.0]
    }
}

# Best hyperparameters per model
best_models = {}

# Train and tune each model
for model_name, model_params in param_distributions.items():
    if model_name == 'Linear Regression':
        model = LinearRegression()
    elif model_name == 'Random Forest':
        model = RandomForestRegressor(random_state=42)
    elif model_name == 'Gradient Boosting':
        model = GradientBoostingRegressor(random_state=42)
    else:
        continue  # skip unknown models

    pipeline = Pipeline([('regressor', model)])
    random_search = RandomizedSearchCV(pipeline, param_distributions=model_params, n_iter=10, cv=3, n_jobs=-1, random_state=42)
    random_search.fit(X_train, y_train)
    best_models[model_name] = random_search.best_params_

# Visualize the best hyperparameters
fig, axes = plt.subplots(len(best_models), 1, figsize=(10, 5 * len(best_models)))
if len(best_models) == 1:
    axes = [axes]  # handle the single-model case

for i, (model_name, params) in enumerate(best_models.items()):
    # Bar heights must be numeric: map None to 0 so e.g. max_depth=None can be plotted
    heights = [0 if v is None else float(v) for v in params.values()]
    axes[i].bar(range(len(params)), heights)
    axes[i].set_title(f"Best hyperparameters for {model_name}")
    axes[i].set_xticks(range(len(params)))
    axes[i].set_xticklabels(params.keys(), rotation=45, ha="right")  # rotate x-axis labels
    axes[i].tick_params(axis='x', which='major', labelsize=8)  # x-axis label font size

plt.tight_layout()
plt.show()
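RandomizedSearchCV refits the best configuration on the full training set by default, so the tuned model can be scored on the held-out test set directly. A possible closing step (assumed, not in the original notebook), using the last search object from the loop, which belongs to Gradient Boosting:

In [ ]:
from sklearn.metrics import mean_absolute_error, r2_score

# Score the refitted best Gradient Boosting model on the held-out test set
best_gb = random_search.best_estimator_
y_pred = best_gb.predict(X_test)
print('Best CV score (R²):', random_search.best_score_)
print('Test MAE:', mean_absolute_error(y_test, y_pred))
print('Test R²:', r2_score(y_test, y_pred))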