AIM-PIbd-32-Kaznacheeva-E-K/lab_4/Lab4.ipynb
2024-11-23 12:17:48 +04:00

201 KiB
Raw Blame History

Начало лабораторной работы

In [8]:
import pandas as pd
df = pd.read_csv("..//static//csv//ds_salaries.csv")
print(df.columns)
Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')
In [10]:
df.head()
Out[10]:
work_year experience_level employment_type job_title salary salary_currency salary_in_usd employee_residence remote_ratio company_location company_size
0 2023 SE FT Principal Data Scientist 80000 EUR 85847 ES 100 ES L
1 2023 MI CT ML Engineer 30000 USD 30000 US 100 US S
2 2023 MI CT ML Engineer 25500 USD 25500 US 100 US S
3 2023 SE FT Data Scientist 175000 USD 175000 CA 100 CA M
4 2023 SE FT Data Scientist 120000 USD 120000 CA 100 CA M
In [9]:
df.describe()
Out[9]:
work_year salary salary_in_usd remote_ratio
count 3755.000000 3.755000e+03 3755.000000 3755.000000
mean 2022.373635 1.906956e+05 137570.389880 46.271638
std 0.691448 6.716765e+05 63055.625278 48.589050
min 2020.000000 6.000000e+03 5132.000000 0.000000
25% 2022.000000 1.000000e+05 95000.000000 0.000000
50% 2022.000000 1.380000e+05 135000.000000 0.000000
75% 2023.000000 1.800000e+05 175000.000000 100.000000
max 2023.000000 3.040000e+07 450000.000000 100.000000
In [11]:
# Процент пропущенных значений признаков
for i in df.columns:
    null_rate = df[i].isnull().sum() / len(df) * 100
    if null_rate > 0:
        print(f'{i} Процент пустых значений: %{null_rate:.2f}')

print(df.isnull().sum())

print(df.isnull().any())
work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64
work_year             False
experience_level      False
employment_type       False
job_title             False
salary                False
salary_currency       False
salary_in_usd         False
employee_residence    False
remote_ratio          False
company_location      False
company_size          False
dtype: bool
In [13]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split

# Загрузка данных
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Создание целевого признака
median_salary = df['salary_in_usd'].median()
df['above_median_salary'] = np.where(df['salary_in_usd'] > median_salary, 1, 0)

# Разделение на признаки и целевую переменную
X = df.drop(columns=['salary_in_usd', 'above_median_salary'])
y = df['above_median_salary']

# Примерная категоризация
df['salary_category'] = pd.cut(df['salary_in_usd'], bins=[0, 100000, 200000, np.inf], labels=[0, 1, 2])

# Выбор признаков и целевых переменных
X = df.drop(columns=['salary_in_usd', 'salary_category'])

def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
   
    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )
    
    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))
    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.
    
    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    if frac_val <= 0:
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp
    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)

    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df, stratify_colname="above_median_salary", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=42
)

display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)

# Проверка преобразования
print(df.dtypes)

# Визуализация распределения зарплат
plt.figure(figsize=(10, 6))
sns.histplot(df['salary_in_usd'], bins=50, kde=True)
plt.title('Распределение зарплат')
plt.xlabel('Зарплата (USD)')
plt.ylabel('Частота')
plt.show()

# Визуализация зависимости между зарплатой и уровнем опыта
plt.figure(figsize=(10, 6))
sns.boxplot(x='experience_level', y='salary_in_usd', data=df)
plt.title('Зависимость зарплаты от уровня опыта')
plt.xlabel('Уровень опыта')
plt.ylabel('Зарплата (USD)')
plt.show()
'X_train'
work_year experience_level employment_type job_title salary salary_currency salary_in_usd employee_residence remote_ratio company_location company_size above_median_salary salary_category
1809 2023 SE FT Data Engineer 182000 USD 182000 US 100 US M 1 1
1082 2023 SE FT Machine Learning Engineer 126000 USD 126000 US 0 US M 0 1
1686 2023 SE FT BI Developer 140000 USD 140000 US 100 US M 1 1
1600 2023 SE FT Data Scientist 140000 USD 140000 US 0 US M 1 1
1376 2023 SE FT Data Engineer 226700 USD 226700 US 0 US M 1 2
... ... ... ... ... ... ... ... ... ... ... ... ... ...
2706 2022 SE FT Data Engineer 160000 USD 160000 US 100 US M 1 1
928 2023 MI FT Data Engineer 200000 USD 200000 US 0 US M 1 1
564 2023 MI FT Data Engineer 140000 USD 140000 US 0 US M 1 1
716 2023 SE FT Data Scientist 297300 USD 297300 US 100 US M 1 2
1299 2023 SE FT Data Engineer 133832 USD 133832 US 0 US M 0 1

3004 rows × 13 columns

'y_train'
above_median_salary
1809 1
1082 0
1686 1
1600 1
1376 1
... ...
2706 1
928 1
564 1
716 1
1299 0

3004 rows × 1 columns

'X_test'
work_year experience_level employment_type job_title salary salary_currency salary_in_usd employee_residence remote_ratio company_location company_size above_median_salary salary_category
3459 2022 MI FT Research Scientist 59000 EUR 61989 AT 0 AT L 0 0
3724 2021 EN FT Business Data Analyst 50000 EUR 59102 LU 100 LU L 0 0
1795 2023 SE FT Data Engineer 180000 USD 180000 US 0 US M 1 1
3535 2021 MI FT Data Scientist 50000 USD 50000 NG 100 NG L 0 0
3255 2022 MI FT Data Analyst 106260 USD 106260 US 0 US M 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1943 2022 MI FT Data Engineer 120000 USD 120000 US 100 US M 0 1
573 2023 EN FT Autonomous Vehicle Technician 7000 USD 7000 GH 0 GH S 0 0
3013 2022 SE FT Machine Learning Engineer 129300 USD 129300 US 0 US M 0 1
327 2023 EN FT Data Scientist 70000 CAD 51753 CA 100 CA L 0 0
1565 2023 SE FT Data Analyst 48000 EUR 51508 ES 0 ES M 0 0

751 rows × 13 columns

'y_test'
above_median_salary
3459 0
3724 0
1795 1
3535 0
3255 0
... ...
1943 0
573 0
3013 0
327 0
1565 0

751 rows × 1 columns

work_year                 int64
experience_level         object
employment_type          object
job_title                object
salary                    int64
salary_currency          object
salary_in_usd             int64
employee_residence       object
remote_ratio              int64
company_location         object
company_size             object
above_median_salary       int64
salary_category        category
dtype: object
No description has been provided for this image
No description has been provided for this image
In [ ]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Загрузка данных
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Создание целевого признака
median_salary = df['salary_in_usd'].median()
df['above_median_salary'] = np.where(df['salary_in_usd'] > median_salary, 1, 0)

# Разделение на признаки и целевую переменную
X = df.drop(columns=['salary_in_usd', 'above_median_salary'])
y = df['above_median_salary']

# Разделение данных на тренировочный и тестовый наборы
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Построение конвейеров предобработки

class SalaryFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # Создание новых признаков
        X = X.copy()
        X["work_year_to_remote_ratio"] = X["work_year"] / X["remote_ratio"]
        return X
    def get_feature_names_out(self, features_in):
        # Добавление имен новых признаков
        new_features = ["work_year_to_remote_ratio"]
        return np.append(features_in, new_features, axis=0)

# Обработка числовых данных. Числовой конвейер: заполнение пропущенных значений медианой и стандартизация
preprocessing_num_class = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Обработка категориальных данных: заполнение пропущенных значений наиболее частым значением и one-hot encoding
preprocessing_cat_class = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Определение столбцов
numeric_columns = ["work_year", "salary", "salary_in_usd", "remote_ratio"]
cat_columns = ["experience_level", "employment_type", "job_title", "salary_currency", "employee_residence", "company_location", "company_size"]

# Предобработка признаков
features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("prepocessing_num", preprocessing_num_class, numeric_columns),
        ("prepocessing_cat", preprocessing_cat_class, cat_columns),
    ],
    remainder="passthrough"
)

# Удаление колонок
columns_to_drop = []  # Укажите столбцы, которые нужно удалить, если они есть
drop_columns = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_columns", "drop", columns_to_drop),
    ],
    remainder="passthrough",
)

# Основной конвейер предобработки данных и конструирования признаков
pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
        ("custom_features", SalaryFeatures()),
        ("drop_columns", drop_columns),
    ]
)

# Демонстрация работы конвейера для предобработки данных при классификации
preprocessing_result = pipeline_end.fit_transform(X_train)

# Получение имен столбцов после преобразования
feature_names = pipeline_end.named_steps['features_preprocessing'].get_feature_names_out(numeric_columns + cat_columns)
feature_names = np.append(feature_names, ["work_year_to_remote_ratio"])

# Создание DataFrame с преобразованными данными
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=feature_names,
)

preprocessed_df
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[14], line 71
     62 pipeline_end = Pipeline(
     63     [
     64         ("features_preprocessing", features_preprocessing),
   (...)
     67     ]
     68 )
     70 # Демонстрация работы конвейера для предобработки данных при классификации
---> 71 preprocessing_result = pipeline_end.fit_transform(X_train)
     72 preprocessed_df = pd.DataFrame(
     73     preprocessing_result,
     74     columns=pipeline_end.get_feature_names_out(),
     75 )
     77 preprocessed_df

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1466     estimator._validate_params()
   1468 with config_context(
   1469     skip_parameter_validation=(
   1470         prefer_skip_nested_validation or global_skip_validation
   1471     )
   1472 ):
-> 1473     return fit_method(estimator, *args, **kwargs)

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\pipeline.py:533, in Pipeline.fit_transform(self, X, y, **params)
    490 """Fit the model and transform with the final estimator.
    491 
    492 Fit all the transformers one after the other and sequentially transform
   (...)
    530     Transformed samples.
    531 """
    532 routed_params = self._check_method_params(method="fit_transform", props=params)
--> 533 Xt = self._fit(X, y, routed_params)
    535 last_step = self._final_estimator
    536 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\pipeline.py:406, in Pipeline._fit(self, X, y, routed_params)
    404     cloned_transformer = clone(transformer)
    405 # Fit or load from cache the current transformer
--> 406 X, fitted_transformer = fit_transform_one_cached(
    407     cloned_transformer,
    408     X,
    409     y,
    410     None,
    411     message_clsname="Pipeline",
    412     message=self._log_message(step_idx),
    413     params=routed_params[name],
    414 )
    415 # Replace the transformer of the step with the fitted
    416 # transformer. This is necessary when loading the transformer
    417 # from the cache.
    418 self.steps[step_idx] = (name, fitted_transformer)

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\joblib\memory.py:312, in NotMemorizedFunc.__call__(self, *args, **kwargs)
    311 def __call__(self, *args, **kwargs):
--> 312     return self.func(*args, **kwargs)

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\pipeline.py:1310, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, params)
   1308 with _print_elapsed_time(message_clsname, message):
   1309     if hasattr(transformer, "fit_transform"):
-> 1310         res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
   1311     else:
   1312         res = transformer.fit(X, y, **params.get("fit", {})).transform(
   1313             X, **params.get("transform", {})
   1314         )

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\utils\_set_output.py:316, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
    314 @wraps(f)
    315 def wrapped(self, X, *args, **kwargs):
--> 316     data_to_wrap = f(self, X, *args, **kwargs)
    317     if isinstance(data_to_wrap, tuple):
    318         # only wrap the first output for cross decomposition
    319         return_tuple = (
    320             _wrap_data_with_container(method, data_to_wrap[0], X, self),
    321             *data_to_wrap[1:],
    322         )

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\base.py:1098, in TransformerMixin.fit_transform(self, X, y, **fit_params)
   1083         warnings.warn(
   1084             (
   1085                 f"This object ({self.__class__.__name__}) has a `transform`"
   (...)
   1093             UserWarning,
   1094         )
   1096 if y is None:
   1097     # fit method of arity 1 (unsupervised transformation)
-> 1098     return self.fit(X, **fit_params).transform(X)
   1099 else:
   1100     # fit method of arity 2 (supervised transformation)
   1101     return self.fit(X, y, **fit_params).transform(X)

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\utils\_set_output.py:316, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
    314 @wraps(f)
    315 def wrapped(self, X, *args, **kwargs):
--> 316     data_to_wrap = f(self, X, *args, **kwargs)
    317     if isinstance(data_to_wrap, tuple):
    318         # only wrap the first output for cross decomposition
    319         return_tuple = (
    320             _wrap_data_with_container(method, data_to_wrap[0], X, self),
    321             *data_to_wrap[1:],
    322         )

Cell In[14], line 18, in SalaryFeatures.transform(self, X, y)
     15 def transform(self, X, y=None):
     16     # Создание новых признаков
     17     X = X.copy()
---> 18     X["work_year_to_remote_ratio"] = X["work_year"] / X["remote_ratio"]
     19     return X

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\scipy\sparse\_csr.py:24, in _csr_base.__getitem__(self, key)
     22 def __getitem__(self, key):
     23     if self.ndim == 2:
---> 24         return super().__getitem__(key)
     26     if isinstance(key, tuple) and len(key) == 1:
     27         key = key[0]

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\scipy\sparse\_index.py:52, in IndexMixin.__getitem__(self, key)
     51 def __getitem__(self, key):
---> 52     row, col = self._validate_indices(key)
     54     # Dispatch to specialized methods.
     55     if isinstance(row, INT_TYPES):

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\scipy\sparse\_index.py:186, in IndexMixin._validate_indices(self, key)
    184     row = _validate_bool_idx(bool_row, M, "row")
    185 elif not isinstance(row, slice):
--> 186     row = self._asindices(row, M)
    188 if isintlike(col):
    189     col = int(col)

File d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\scipy\sparse\_index.py:212, in IndexMixin._asindices(self, idx, length)
    209     raise IndexError('invalid index') from e
    211 if x.ndim not in (1, 2):
--> 212     raise IndexError('Index dimension must be 1 or 2')
    214 if x.size == 0:
    215     return x

IndexError: Index dimension must be 1 or 2

Бизнес-цели

  1. Предсказание заработной платы (Регрессия)

    Цель: Предсказать заработную плату (salary_in_usd) на основе других характеристик, таких как уровень опыта (experience_level), тип занятости (employment_type), должность (job_title), место проживания сотрудника (employee_residence), размер компании (company_size) и другие факторы.

    Применение: Это может быть полезно для HR-отделов, которые хотят оценить справедливую зарплату для новых сотрудников или для анализа рынка труда.

  2. Классификация уровня опыта по зарплате (Классификация)

    Цель: Классифицировать уровень опыта (experience_level) на основе заработной платы (salary_in_usd) и других факторов.

    Применение: Это может помочь в оценке, на каком уровне опыта находится сотрудник, основываясь на его зарплате, что может быть полезно для оценки карьерного роста.

  1. Прогнозирование зарплаты
In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt

# Загружаем набор данных
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Устанавливаем случайное состояние
random_state = 42

# Предварительный анализ данных
print(df.head())
print(df.info())
print(df.describe())

# Проверка на пропущенные значения
print(df.isnull().sum())

# Предобработка данных
# Определяем категориальные и числовые столбцы
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']

# Создаем пайплайн для обработки данных
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

# Определяем целевую переменную и признаки
X = df.drop('salary_in_usd', axis=1)
y = df['salary_in_usd']

# Разделяем данные на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Создаем и обучаем модель
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())])

model.fit(X_train, y_train)

# Делаем предсказания на тестовой выборке
y_pred = model.predict(X_test)

# Оцениваем качество модели
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")
   work_year experience_level employment_type                 job_title  \
0       2023               SE              FT  Principal Data Scientist   
1       2023               MI              CT               ML Engineer   
2       2023               MI              CT               ML Engineer   
3       2023               SE              FT            Data Scientist   
4       2023               SE              FT            Data Scientist   

   salary salary_currency  salary_in_usd employee_residence  remote_ratio  \
0   80000             EUR          85847                 ES           100   
1   30000             USD          30000                 US           100   
2   25500             USD          25500                 US           100   
3  175000             USD         175000                 CA           100   
4  120000             USD         120000                 CA           100   

  company_location company_size  
0               ES            L  
1               US            S  
2               US            S  
3               CA            M  
4               CA            M  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB
None
         work_year        salary  salary_in_usd  remote_ratio
count  3755.000000  3.755000e+03    3755.000000   3755.000000
mean   2022.373635  1.906956e+05  137570.389880     46.271638
std       0.691448  6.716765e+05   63055.625278     48.589050
min    2020.000000  6.000000e+03    5132.000000      0.000000
25%    2022.000000  1.000000e+05   95000.000000      0.000000
50%    2022.000000  1.380000e+05  135000.000000      0.000000
75%    2023.000000  1.800000e+05  175000.000000    100.000000
max    2023.000000  3.040000e+07  450000.000000    100.000000
work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64
Mean Squared Error: 2482079980.9527493
R^2 Score: 0.37127352660208646
  1. Классифицировать уровень опыта
In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt


# Загружаем набор данных
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Устанавливаем случайное состояние
random_state = 42


# Предобработка данных
# Определяем категориальные и числовые столбцы
categorical_features = ['employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'salary_in_usd', 'remote_ratio']

# Создаем пайплайн для обработки данных
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

# Определяем целевую переменную и признаки
X = df.drop('experience_level', axis=1)
y = df['experience_level']

# Разделяем данные на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Создаем и обучаем модель
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=random_state))])

model.fit(X_train, y_train)

# Делаем предсказания на тестовой выборке
y_pred = model.predict(X_test)

# Оцениваем качество модели
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")

# Визуализация результатов
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
Classification Report:
              precision    recall  f1-score   support

          EN       0.55      0.48      0.51        67
          EX       0.46      0.26      0.33        23
          MI       0.48      0.54      0.51       157
          SE       0.83      0.83      0.83       504

    accuracy                           0.72       751
   macro avg       0.58      0.53      0.55       751
weighted avg       0.72      0.72      0.72       751

Confusion Matrix:
[[ 32   0  20  15]
 [  0   6   5  12]
 [ 14   0  84  59]
 [ 12   7  65 420]]
Accuracy Score: 0.7217043941411452
No description has been provided for this image

Ориентир

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Предобработка данных
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

X = df.drop('salary_in_usd', axis=1)
y = df['salary_in_usd']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R²: {r2}")

# Проверяем, достигнуты ли ориентиры
if r2 >= 0.75 and mae <= 15000 and rmse <= 20000:
    print("Ориентиры для предсказания заработной платы достигнуты!")
else:
    print("Ориентиры для предсказания заработной платы не достигнуты.")
MAE: 37795.639591701794
MSE: 2482079980.9527493
RMSE: 49820.47752634201
R²: 0.37127352660208646
Ориентиры для предсказания заработной платы не достигнуты.
d:\MII\AIM-PIbd-32-Kaznacheeva-E-K\aimenv\Lib\site-packages\sklearn\metrics\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Загружаем набор данных
df = pd.read_csv("..//static//csv//ds_salaries.csv")

# Предобработка данных
categorical_features = ['employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'salary_in_usd', 'remote_ratio']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

X = df.drop('experience_level', axis=1)
y = df['experience_level']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Проверяем, достигнуты ли ориентиры
if accuracy >= 0.80:
    print("Ориентиры для классификации уровня опыта достигнуты!")
else:
    print("Ориентиры для классификации уровня опыта не достигнуты.")
Accuracy: 0.7217043941411452
Classification Report:
              precision    recall  f1-score   support

          EN       0.55      0.48      0.51        67
          EX       0.46      0.26      0.33        23
          MI       0.48      0.54      0.51       157
          SE       0.83      0.83      0.83       504

    accuracy                           0.72       751
   macro avg       0.58      0.53      0.55       751
weighted avg       0.72      0.72      0.72       751

Confusion Matrix:
[[ 32   0  20  15]
 [  0   6   5  12]
 [ 14   0  84  59]
 [ 12   7  65 420]]
Ориентиры для классификации уровня опыта не достигнуты.

Конвейер

In [7]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Определение столбцов
numeric_columns = ["work_year", "salary", "salary_in_usd", "remote_ratio"]
cat_columns = ["experience_level", "employment_type", "job_title", "salary_currency", "employee_residence", "company_location", "company_size"]

# Обработка числовых данных: заполнение пропущенных значений медианой и стандартизация
preprocessing_num_class = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Обработка категориальных данных: заполнение пропущенных значений наиболее частым значением и one-hot encoding
preprocessing_cat_class = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Объединение всех преобразований в один ColumnTransformer
features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("prepocessing_num", preprocessing_num_class, numeric_columns),
        ("prepocessing_cat", preprocessing_cat_class, cat_columns),
    ],
    remainder="passthrough"
)

# Определение конвейера
pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
    ]
)
In [5]:
# Разделение данных на тренировочный и тестовый наборы
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

# Применение конвейера для предобработки данных
preprocessing_result = pipeline_end.fit_transform(X_train)

# Получение имен столбцов после преобразования
feature_names = pipeline_end.named_steps['features_preprocessing'].get_feature_names_out()

# Создание DataFrame с преобразованными данными
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=feature_names,
)

# Вывод преобразованного DataFrame
print(preprocessed_df)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 2
      1 # Разделение данных на тренировочный и тестовый наборы
----> 2 X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)
      4 # Применение конвейера для предобработки данных
      5 preprocessing_result = pipeline_end.fit_transform(X_train)

NameError: name 'train_test_split' is not defined